randomfox: (Default)
[personal profile] randomfox
These scripts generate a Top N list of the users who have contributed the most photos to the 10 Million Photos Flickr group.

ApiKey.pm, a short module containing the API key:


#!perl -w
use strict;

# ApiKey: shared Flickr settings for the scripts that `use ApiKey`.
# Every variable below is listed in @EXPORT, so it is imported by default
# into the calling package under the same name.
package ApiKey;
use base 'Exporter';
our @EXPORT = qw($api_key $shared_secret $auth_token $groupid);

# The "..." values look like redacted placeholders -- presumably they must be
# replaced with real Flickr credentials before the scripts can run.
our $api_key = "...";
our $shared_secret = "...";
our $auth_token = "...";
# Id of the Flickr group whose pool is queried
# (presumably the 10 Million Photos group -- TODO confirm).
our $groupid = '20759249@N00';

# A module file must end with a true value.
1;

__END__



tenmil.pl, a script that goes through the whole group counting photos:


#!perl -w
use strict;

use Flickr::API;
use Encode;
use Getopt::Long;

use XML::Simple;
use LWP::UserAgent;
use Time::HiRes qw(usleep);
use FileHandle;

use ApiKey;

my $SLEEPTIME = 500000;

# Call a Flickr API method, repeating the call while it keeps failing.
#
# Arguments:
#   $api    - object providing execute_method (a Flickr::API instance in
#             this script)
#   $method - name of the Flickr method to call
#   $param  - hash reference of arguments for that method
#
# At most six attempts are made (the first call plus five retries), and the
# script pauses for $SLEEPTIME microseconds after every attempt, whether it
# succeeded or not.  Returns the last response received; the caller is still
# responsible for checking its {success} flag.
sub FlickrRetry {
    my ($api, $method, $param) = @_;

    my $max_retries = 5;
    my $response;
    for my $attempt (0 .. $max_retries) {
        $response = $api->execute_method($method, $param);
        usleep($SLEEPTIME);
        last if $response->{success};
    }
    return $response;
}

# Turn a Flickr photo-list response into a list of photo records.
#
# Argument:
#   $response - response object whose {_content} holds the XML reply
#
# Prints a "Page X of Y..." progress line, then returns a two-element list:
# the total number of pages reported by Flickr, and a reference to an array
# of photo hashes.  Each photo hash is the parsed <photo> element with two
# keys added: {id} (the key it was stored under in the parsed reply) and
# {url} (the photo's page address).  The array is in hash-key order, i.e.
# no particular order.
sub GenPhotoList {
    my ($response) = @_;

    my $parsed = XML::Simple->new->XMLin($response->{_content},
        forcearray => ['photo']);

    my $photos = $parsed->{photos};
    print "Page $photos->{page} of $photos->{pages}...\n";

    # The parsed reply holds the photos as a hash keyed by photo id.
    my $by_id = $photos->{photo};
    my @photoarr = map {
        my $photo = $by_id->{$_};
        $photo->{id} = $_;
        $photo->{url} = "http://www.flickr.com/photos/$photo->{owner}/$photo->{id}";
        $photo;
    } keys %{$by_id};

    return ($photos->{pages}, \@photoarr);
}

# Create a new log file and return a handle open for writing to it.
#
# Argument:
#   $prefix - text placed at the start of the file name
#
# The file name is $prefix, followed by the current epoch time in upper-case
# hexadecimal, followed by ".htm"; it is created in the current directory
# unless $prefix contains a path.  The returned FileHandle has autoflush
# turned on, so every print reaches the file immediately.
# Dies with a message containing the file name and $! if the file cannot be
# opened.
sub OpenLog {
    my ($prefix) = @_;

    # The prefix is passed as a sprintf argument rather than interpolated
    # into the format string, so a "%" in the prefix is kept literally
    # instead of being treated as a conversion.
    my $logfn = sprintf("%s%X.htm", $prefix, time);

    my $fh = FileHandle->new($logfn, "w");
    defined $fh or die "Error opening $logfn for writing: $!\n";
    $fh->autoflush(1);
    return $fh;
}

# Fetch one page of the group's photo pool and add it to the running tallies.
#
# Arguments:
#   $groupid     - id of the Flickr group to query
#   $pagenum     - number of the page to fetch
#   $pagelen     - number of photos requested per page
#   $ownercounts - hash reference: owner id => photos counted so far
#   $ownernames  - hash reference: owner id => owner name
#
# Both hashes are updated in place.  An owner's name is recorded, encoded as
# ISO-8859-1, the first time that owner is seen.
# Returns the total number of pages Flickr reports for the pool.
# Dies with Flickr's error message if the request still fails after the
# retries done by FlickrRetry.
sub CountPhotos {
    my ($groupid, $pagenum, $pagelen, $ownercounts, $ownernames) = @_;

    my $api = Flickr::API->new({ key => $api_key, secret => $shared_secret });

    my %query = (
        group_id   => $groupid,
        per_page   => $pagelen,
        page       => $pagenum,
        auth_token => $auth_token,
    );
    my $response = FlickrRetry($api, "flickr.groups.pools.getPhotos", \%query);
    $response->{success} or die "Error: $response->{error_message}\n";

    my ($totalpages, $photolist) = GenPhotoList($response);

    for my $photo (@$photolist) {
        my $owner = $photo->{owner};
        if (!defined $ownercounts->{$owner}) {
            # First photo seen for this owner: remember the name, start at 0.
            $ownernames->{$owner} = encode("iso-8859-1", $photo->{ownername});
            $ownercounts->{$owner} = 0;
        }
        $ownercounts->{$owner}++;
    }

    return $totalpages;
}

# Print the command-line help text to standard output and terminate the
# script with exit status 1.  Never returns.
sub Usage {
    my $help = <<"END_USAGE";
$0 [-pagelen=n] startpagenum endpagenum

-pagelen=n: 
	Set the page length to n. 
	n must be at least 1. 
	Default n is 500.
END_USAGE
    print $help;
    exit 1;
}

# ---- Main program ----

# Running tallies, both keyed by owner id: photos counted and owner name.
my (%ownercounts, %ownernames);

# Photos requested per page unless -pagelen overrides it.
my $pagelen = 500;

Getopt::Long::Configure("bundling_override");
Usage() unless GetOptions('pagelen=s' => \$pagelen);
if ($pagelen < 1) {
    die "Page length must be at least 1\n";
}

# Positional arguments: first and last page to scan.  A missing (or zero)
# start page means page 1; a missing end page means only the start page.
my $startpagenum = shift(@ARGV) || 1;
my $endpagenum = shift(@ARGV) || $startpagenum;

# Accept the two page numbers in either order.
if ($endpagenum < $startpagenum) {
    ($startpagenum, $endpagenum) = ($endpagenum, $startpagenum);
}

print "Scanning pages $startpagenum to $endpagenum...\n";

for my $pagenum ($startpagenum .. $endpagenum) {
    my $totalpages = CountPhotos($groupid, $pagenum, $pagelen,
        \%ownercounts, \%ownernames);
    # Stop early once the last page Flickr reports has been counted.
    $pagenum < $totalpages or last;
}

# Write the results: a header line naming the requested page range, then one
# "count,owner-id,owner-name" line per owner, highest count first.
my $logfh = OpenLog("x");
print $logfh "Pages $startpagenum to $endpagenum:\n";

my @ranked = sort { $ownercounts{$b} <=> $ownercounts{$a} } keys %ownercounts;
print $logfh "$ownercounts{$_},$_,$ownernames{$_}\n" for @ranked;
$logfh->close;

__END__



runten.pl, a wrapper that runs tenmil.pl block by block, retrying if necessary:


#!perl -w
use strict;

use FileHandle;
use File::DosGlob;

# ---- Main program ----
#
# Runs tenmil.pl over the whole pool in blocks of ten pages, working from
# the highest block down to page 1.  Finished blocks are recognised by the
# "Pages A to B:" header line of the x*.htm logs that tenmil.pl writes, so a
# block whose run produced no log is simply run again.

@ARGV < 1 and die "Usage: $0 pool-size\n";

# Number of photos in the pool, as given on the command line.
my $poolsize = shift;

# Pages in the pool, rounded up, at 500 photos per page (tenmil.pl is
# started below without -pagelen).
my $pagecount = int(($poolsize + 499) / 500);

# Last page of the highest ten-page block, i.e. $pagecount rounded up to a
# multiple of ten.
my $topset = int(($pagecount + 9) / 10) * 10;

for (;;) {
    # Go through the whole list of files every time in case the most recent
    # run of tenmil.pl failed.
    my @filelist = File::DosGlob::glob "x*.htm";
    for my $file (@filelist) {
        # Fail with a clear message instead of crashing later on a method
        # call through an undefined handle.
        my $fh = FileHandle->new($file, "r");
        defined $fh or die "Error opening $file for reading: $!\n";
        my $firstline = <$fh>;
        $fh->close;

        # An empty log has no header line, so it records no finished block.
        next unless defined $firstline;

        if ($firstline =~ /^Pages \d+ to (\d+):/) {
            # This block is done: continue with the block below it.
            $1 <= $topset and $topset = $1 - 10;
        }
    }

    last if $topset <= 0;

    my $startset = $topset - 9;
    my $status = system "tenmil.pl $startset $topset";

    # system returns -1 when the program could not be started at all.
    # Retrying cannot fix that and would spin in this loop forever.
    $status == -1 and die "Error running tenmil.pl: $!\n";
}

__END__



tenshort.pl, a script that merges the output from tenmil.pl and updates counts for the top 150 users:


#!perl -w
use strict;

use FileHandle;
use File::DosGlob;

use Flickr::API;
use Encode;

use XML::Simple;
use LWP::UserAgent;
use Time::HiRes qw(usleep);

use ApiKey;

my $SLEEPTIME = 500000;

# Call a Flickr API method, repeating the call while it keeps failing.
#
# Arguments:
#   $api    - object providing execute_method (a Flickr::API instance in
#             this script)
#   $method - name of the Flickr method to call
#   $param  - hash reference of arguments for that method
#
# At most six attempts are made (the first call plus five retries), and the
# script pauses for $SLEEPTIME microseconds after every attempt, whether it
# succeeded or not.  Returns the last response received; the caller is still
# responsible for checking its {success} flag.
sub FlickrRetry {
    my ($api, $method, $param) = @_;

    my $max_retries = 5;
    my $response;
    for my $attempt (0 .. $max_retries) {
        $response = $api->execute_method($method, $param);
        usleep($SLEEPTIME);
        last if $response->{success};
    }
    return $response;
}

# Expand each argument as a DOS-style glob pattern with File::DosGlob and
# return all the resulting names as one flat list, in argument order.
sub glob_args {
    my @expanded;
    for my $pattern (@_) {
        push @expanded, File::DosGlob::glob($pattern);
    }
    return @expanded;
}

# Ask Flickr how many photos one user has in a group's pool.
#
# Arguments:
#   $groupid - id of the Flickr group
#   $userid  - id of the user whose photos are counted
#
# The pool is requested one photo per page, restricted to that user, and the
# page total from the reply is returned; with one photo per page the page
# total presumably equals the user's photo count.
# Dies with Flickr's error message if the request still fails after the
# retries done by FlickrRetry.
sub GetCount {
    my ($groupid, $userid) = @_;

    my $api = Flickr::API->new({ key => $api_key, secret => $shared_secret });

    my %query = (
        group_id   => $groupid,
        user_id    => $userid,
        per_page   => 1,
        auth_token => $auth_token,
    );
    my $response = FlickrRetry($api, "flickr.groups.pools.getPhotos", \%query);
    $response->{success} or die "Error: $response->{error_message}\n";

    my $reply = XML::Simple->new->XMLin($response->{_content});
    return $reply->{photos}{pages};
}


# ---- Main program ----
#
# Merges the "count,owner-id,owner-name" lines of the log files named on the
# command line (glob patterns allowed), picks the owners with the highest
# merged counts, asks Flickr for each one's current count, and writes the
# re-ranked list to newcount.txt.

# Maximum number of users whose counts are refreshed and reported.
my $TOP_N = 150;

# owner id => { owner => id, name => owner name, count => merged count }
my %owners;

@ARGV = glob_args @ARGV;

while (my $line = <>) {
    chomp $line;

    # Skip the "Pages A to B:" header line of each log.
    next if $line =~ /^Pages /;

    # Limit the split to three fields so a comma in the name is kept.
    my ($count, $owner, $ownername) = split(/,/, $line, 3);

    # Ignore lines that do not have all three fields.
    defined $ownername or next;

    unless (defined $owners{$owner}) {
        $owners{$owner} = { owner => $owner, name => $ownername, count => 0 };
    }

    $owners{$owner}{count} += $count;
}

my @ranked = sort { $b->{count} <=> $a->{count} } values %owners;

# Keep at most $TOP_N users.  A fixed list slice [0..149] would pad the list
# with undef entries when fewer owners exist, and each of those would then
# be sent to Flickr as a user with no id.
my @topusers = @ranked > $TOP_N ? @ranked[0 .. $TOP_N - 1] : @ranked;

my $lineno = 0;
for my $user (@topusers) {
    $lineno++;
    print "$lineno. Getting photocount for user $user->{name}...\n";

    # Replace the merged count with the figure Flickr reports now.
    $user->{count} = GetCount($groupid, $user->{owner});
}

# The refreshed counts can change the order, so rank again.
@topusers = sort { $b->{count} <=> $a->{count} } @topusers;

my $fh = FileHandle->new("newcount.txt", "w");
defined $fh or die "Can't open newcount.txt for writing: $!\n";

$lineno = 0;
for my $user (@topusers) {
    $lineno++;
    print $fh "$lineno. $user->{name}: $user->{count}\n";
}

$fh->close;

__END__

Profile

randomfox: (Default)
randomfox

November 2012

S M T W T F S
    123
45678910
11121314151617
18192021222324
25262728 2930 

Most Popular Tags

Style Credit

Expand Cut Tags

No cut tags
Page generated Sep. 26th, 2017 05:44 am
Powered by Dreamwidth Studios