randomfox: (Default)
[personal profile] randomfox
These scripts generate a Top N list for users in the 10 Million Photos Flickr group.

1. ApiKey.pm, a short module containing the API key:


#!perl -w
use strict;

package ApiKey;
use base 'Exporter';
our @EXPORT = qw($api_key $shared_secret $auth_token $groupid);

our $api_key = "";
our $shared_secret = "";
our $auth_token = "";
our $groupid = '20759249@N00';

1;

__END__



2. TenCommon.pm, functions shared by all the scripts:


#!perl -w
use strict;

package TenCommon;
use base 'Exporter';
our @EXPORT = qw(
    FlickrRetry GenPhotoList GetDateaddedRange GetPage
    OpenDB CloseDB AddToDB HighestDateInDB OwnerStatsFromDB
);

use Flickr::API;

use XML::Simple;
use LWP::UserAgent;
use Time::HiRes qw(usleep);

use ApiKey;

my $SLEEPTIME = 500000;

# Query Flickr with retry.
sub FlickrRetry {
    my $method = shift;
    my $param = shift;

    $param->{auth_token} = $auth_token;

    my $retry_count = 0;
    my $response;
    do {
	my $api = new Flickr::API(
	    {
		'key' => $api_key,
		secret => $shared_secret
	    }
	);
	$response = $api->execute_method($method, $param);
	usleep $SLEEPTIME;
    } while $retry_count++ < 5 and not $response->{success};
    $response;
}

# Generate a list of photos from the Flickr query response.
sub GenPhotoList {
    my $response = shift;

    my $xmlp = new XML::Simple;
    my $xm = $xmlp->XMLin($response->{_content}, forcearray=>['photo']);

    my $photos = $xm->{photos};
    print "Page $photos->{page} of $photos->{pages}...\n";

    my $photolist = $photos->{photo};
    my @photoarr;

    for my $id (keys %{$photolist}) {
	my $photo = $photolist->{$id};
	$photo->{id} = $id;
	$photo->{url} = "http://www.flickr.com/photos/$photo->{owner}/$photo->{id}";
	push @photoarr, $photo;
    }
    ( $photos->{pages}, \@photoarr );
}

# Get highest and lowest values of the dateadded field.
sub GetDateaddedRange {
    my $photolist = shift;
    my $lodate = 0x7FFFFFFF;
    my $hidate = 0;

    for my $photo (@$photolist) {
	my $date = $photo->{dateadded};
	$date < $lodate and $lodate = $date;
	$date > $hidate and $hidate = $date;
    }

    ($lodate, $hidate);
}

sub GetPage {
    my $groupid = shift;
    my $pagenum = shift;
    my $pagelen = shift;

    my $response = FlickrRetry("flickr.groups.pools.getPhotos",
	{
	    group_id => $groupid,
	    per_page => $pagelen,
	    page => $pagenum
	});

    die "Error: $response->{error_message}\n" unless $response->{success};

    my ($totalpages, $photolist) = GenPhotoList($response);
    my ($lodate, $hidate) = GetDateaddedRange($photolist);

    ( $totalpages, $lodate, $hidate, $photolist );
}


use DBI;

my $DBNAME = "10pool.db";
my $TBLNAME = "statstable";

sub OpenDB {
    my $dbh = DBI->connect("dbi:SQLite:dbname=$DBNAME", "", "",
	{ RaiseError => 1, AutoCommit => 1 });

    # Create the table if it does not exist.
    my $sth = $dbh->prepare("PRAGMA table_info($TBLNAME)")
	or die $dbh->errstr;
    $sth->execute
	or die $sth->errstr;

    my @row = $sth->fetchrow_array;
    if (!@row) {
	$sth->err and
	    die $sth->errstr;
	print "Table $TBLNAME not present\n";
	$sth = $dbh->prepare("CREATE TABLE $TBLNAME (" .
	    "key INTEGER PRIMARY KEY, " .
	    "startpage INTEGER, endpage INTEGER, " .
	    "lowdate INTEGER, highdate INTEGER, " . 
	    "stats TEXT, " . 
	    "timestamp INTEGER" .
	    ")") or die $dbh->errstr;
	$sth->execute
	    or die $sth->errstr;
    }
    $dbh;
}

sub CloseDB {
    my $dbh = shift;
    $dbh->disconnect;
}

sub AddToDB {
    my $dbh = shift;

    my $startpage = shift;
    my $endpage = shift;
    my $lowdate = shift;
    my $highdate = shift;
    my $stats = shift;

    my $sth = $dbh->prepare("INSERT INTO $TBLNAME (startpage, endpage, lowdate, highdate, stats, timestamp) VALUES (?, ?, ?, ?, ?, ?)") 
	or die $dbh->errstr;
    $sth->execute($startpage, $endpage, $lowdate, $highdate, $stats, time) 
	or die $sth->errstr;
}

sub HighestDateInDB {
    my $dbh = shift;

    my $highestdate = 0;
    my $toppage = 0;

    my $sth = $dbh->prepare("SELECT endpage, highdate FROM $TBLNAME WHERE highdate = (SELECT max(highdate) FROM $TBLNAME)")
	or die $dbh->errstr;
    $sth->execute
	or die $sth->errstr;

    my @row;
    if (@row = $sth->fetchrow_array) {
	($toppage, $highestdate) = @row;
    }
    elsif ($sth->err) {
	die $sth->errstr;
    }

    ($highestdate, $toppage);
}

sub OwnerStatsFromDB {
    my $dbh = shift;

    my %owners;

    my $sth = $dbh->prepare("SELECT stats FROM $TBLNAME")
	or die $dbh->errstr;
    $sth->execute
	or die $sth->errstr;

    my @row;
    while (@row = $sth->fetchrow_array) {
	for my $line (split(/\n/, $row[0])) {
	    my ($count, $owner, $ownername) = split(/,/, $line, 3);
	    defined $ownername or next;

	    unless (defined $owners{$owner}) {
		$owners{$owner} = { 
		    owner => $owner,
		    name => $ownername,
		    count => 0 
		};
	    }
	    $owners{$owner}{count} += $count;
	}
    }
    $sth->err and die $sth->errstr;

    \%owners;
}

1;

__END__



3. runten2.pl, scan the group pool and update the database incrementally:


#!perl -w
use strict;

use FileHandle;
use File::DosGlob;

use TenCommon;
use ApiKey;

my $pagelen = 500;
my $pageclusterlen = 10;

sub GetTotalPages {
    my ($totalpages, undef, undef, undef) = GetPage($groupid, 1, $pagelen);
    $totalpages;
}

sub GetDateRange {
    my $pagenum = shift;
    my (undef, $lodate, $hidate, undef) = GetPage($groupid, $pagenum, $pagelen);
    ($lodate, $hidate);
}

# Get the highest date from output files in this directory.
sub GetHighestDate_old {
    my $highestdate = 0;
    my $toppage = 0;

    my @filelist = File::DosGlob::glob "x*.htm";
    for my $file (@filelist) {
	my $fh = new FileHandle $file, "r";
	my $firstline = <$fh>;
	if ($firstline =~ /Pages \d+ to (\d+): Dates \d+ to (\d+):/) {
	    my $hipage = $1;
	    my $hidate = $2;
	    if ($hidate > $highestdate) {
		$highestdate = $hidate;
		$toppage = $hipage;
	    }
	}
	$fh->close;
    }

    return ($highestdate, $toppage);
}

sub GetHighestDate {
    my $dbh = OpenDB;
    my ($highestdate, $toppage) = HighestDateInDB $dbh;
    CloseDB $dbh;
    ($highestdate, $toppage);
}

# Do binary search to find the page number containing a specific date.
sub FindPage {
    my $lower = 1;
    my $upper = shift;
    my $date = shift;

    print "Binary search: low=$lower upr=$upper Looking for date $date...\n";

    while (1) {
	my $mid = int(($lower + $upper) / 2);

	my ($lodate, $hidate) = GetDateRange($mid);

	print "Binary search: low=$lower upr=$upper mid=$mid Dates $lodate to $hidate...\n";
	if ($date > $hidate) {
	    $upper = $mid;
	}
	elsif ($date < $lodate) {
	    $lower = $mid;
	}
	else {
	    return $mid;
	}
    }
}

sub main {
    my $totalpages = GetTotalPages;
    print "Total pages = $totalpages\n";

    my ($highestdate, $toppage) = GetHighestDate();
    print "Highest date = $highestdate, Top page = $toppage\n";

    my $highestpage = ($highestdate ? 
	FindPage($totalpages, $highestdate) : 
	$totalpages);

    while (1) {
	# Look through the whole list of files every time in case the most
	# recent run of tenmil.pl failed.
	my ($hidate, $hipage) = GetHighestDate();
	my $endset = $highestpage;
	$hidate > $highestdate and $endset = $hipage - $pageclusterlen;

	print "hidate = $hidate, hipage = $hipage, endset = $endset\n";

	last if $endset <= 0;

	my $startset = $endset - $pageclusterlen + 1;
	$startset < 1 and $startset = 1;
	print "startset = $startset, endset = $endset\n";

	system "perl tenmil.pl $startset $endset $highestdate";
    }
}

main;

__END__



4. tenshort.pl, use Flickr API to correct the stats for the Top 150 and produce the report:


#!perl -w
use strict;

use FileHandle;
use File::DosGlob;

use TenCommon;
use ApiKey;

sub glob_args {
    map { File::DosGlob::glob $_ } @_;
}

# Count a user's photos by setting the page length to 1 and getting the
# number of pages.
sub GetCount {
    my $groupid = shift;
    my $userid = shift;

    my $response = FlickrRetry("flickr.groups.pools.getPhotos",
	{
	    group_id => $groupid,
	    user_id => $userid,
	    per_page => 1
	});
    die "Error: $response->{error_message}\n" unless $response->{success};

    my $xmlp = new XML::Simple;
    my $xm = $xmlp->XMLin($response->{_content});
    my $photos = $xm->{photos};

    $photos->{pages};
}

sub main {
    my $dbh = OpenDB;
    my $owners = OwnerStatsFromDB $dbh;
    CloseDB $dbh;

    my @topusers = (sort { $b->{count} <=> $a->{count} } values %$owners) [0..149];

    my $lineno = 0;
    for my $user (@topusers) {
	$lineno++;
	print "$lineno. Getting photocount for user $user->{name}...\n";
	$user->{count} = GetCount($groupid, $user->{owner});
    }

    @topusers = sort { $b->{count} <=> $a->{count} } @topusers;

    my $fh = new FileHandle "newcount.txt", "w";
    defined $fh or die "Can't open newcount.txt for writing: $!\n";

    $lineno = 0;
    for my $user (@topusers) {
	$lineno++;
	print $fh "$lineno. $user->{name}: $user->{count}\n";
    }

    $fh->close;
}

main;

__END__



5. tenconvert.pl, convert from old flatfile database to SQLite:


#!perl -w
use strict;

# Add datafiles to database.
# Usage: tenconvert.pl x*.htm

use FileHandle;
use File::DosGlob;

use TenCommon;

sub glob_args {
    map { File::DosGlob::glob $_ } @_;
}

sub ProcessFiles {
    my $dbh = shift;

    for my $file (@_) {
	my $fh = new FileHandle $file, "r";
	defined $fh or die "Error opening file $file for reading: $!\n";
	my $line = <$fh>;
	next unless $line =~ /Pages (\d+) to (\d+): Dates (\d+) to (\d+):/;

	my $startpage = $1;
	my $endpage = $2;
	my $lowdate = $3;
	my $highdate = $4;

	my $stats = '';
	while (defined($line = <$fh>)) {
	    $stats .= $line;
	}

	$fh->close;

	AddToDB($dbh, $startpage, $endpage, $lowdate, $highdate, $stats);
    }
}

my $dbh = OpenDB;
# Much faster to turn AutoCommit off if adding many records.
$dbh->{AutoCommit} = 0;
ProcessFiles($dbh, glob_args @ARGV);
$dbh->commit;
CloseDB $dbh;

__END__

From:
Anonymous( )Anonymous This account has disabled anonymous posting.
OpenID( )OpenID You can comment on this post while signed in with an account from many other sites, once you have confirmed your email address. Sign in using OpenID.
User
Account name:
Password:
If you don't have an account you can create one now.
Subject:
HTML doesn't work in the subject.

Message:

 
Notice: This account is set to log the IP addresses of everyone who comments.
Links will be displayed as unclickable URLs to help prevent spam.

Profile

randomfox: (Default)
randomfox

November 2012

S M T W T F S
    123
45678910
11121314151617
18192021222324
25262728 2930 

Most Popular Tags

Style Credit

Expand Cut Tags

No cut tags
Page generated Sep. 26th, 2017 05:32 am
Powered by Dreamwidth Studios