randomfox: (Default)
[personal profile] randomfox
These scripts generate a Top N list for users in the 10 Million Photos Flickr group.

ApiKey.pm, a short module containing the API key:


#!perl -w
use strict;

# ApiKey: holds the Flickr API credentials and the id of the target group.
# Scripts that `use ApiKey` get all four variables imported by default.
package ApiKey;
use base 'Exporter';
# Names listed in @EXPORT are imported into the caller without being asked for.
our @EXPORT = qw($api_key $shared_secret $auth_token $groupid);

# NOTE(review): the "..." values look like placeholders -- presumably they
# must be replaced with real credentials before the scripts can run.
our $api_key = "...";
our $shared_secret = "...";
our $auth_token = "...";
# Flickr group id passed as group_id in the pool queries.
our $groupid = '20759249@N00';

# A module must end with a true value so that `use`/`require` succeeds.
1;

__END__



tenmil.pl, a script that goes through the whole group counting photos:


#!perl -w
use strict;

use Flickr::API;
use Encode;
use Getopt::Long;

use XML::Simple;
use LWP::UserAgent;
use Time::HiRes qw(usleep);
use FileHandle;

use ApiKey;

# Pause, in microseconds, taken after every Flickr API call.
my $SLEEPTIME = 500000;

# Call a Flickr API method, repeating the call while it keeps failing.
#   $api    - object providing execute_method()
#   $method - name of the Flickr method to call
#   $param  - hash ref of method arguments
# At most six calls are made (the first attempt plus five retries) and each
# call, successful or not, is followed by a pause of $SLEEPTIME microseconds.
# Returns the last response received, whether or not it succeeded.
sub FlickrRetry {
    my ($api, $method, $param) = @_;

    my $response;
    for my $attempt (0 .. 5) {
        $response = $api->execute_method($method, $param);
        usleep $SLEEPTIME;
        last if $response->{success};
    }
    return $response;
}

# Turn a Flickr query response into a list of photo records.
#   $response - response object; its XML body is read from {_content}
# Prints a "Page X of Y..." progress line to standard output.
# Returns a two-element list: the total page count reported by the response
# and an array ref of photo hashes. Each photo hash is given an {id} field
# and a {url} field built from its owner and id.
sub GenPhotoList {
    my ($response) = @_;

    my $parser = XML::Simple->new;
    my $tree = $parser->XMLin($response->{_content}, forcearray => ['photo']);
    my $photos = $tree->{photos};

    print "Page $photos->{page} of $photos->{pages}...\n";

    # The parsed {photo} entry is walked as a hash whose keys are photo ids.
    my $photos_by_id = $photos->{photo};
    my @photoarr;
    foreach my $photo_id (keys %{$photos_by_id}) {
        my $record = $photos_by_id->{$photo_id};
        $record->{id} = $photo_id;
        $record->{url} = "http://www.flickr.com/photos/$record->{owner}/$record->{id}";
        push @photoarr, $record;
    }

    return ( $photos->{pages}, \@photoarr );
}

# Open a new log file for writing.
#   $prefix - text placed at the start of the file name
# The file name is the prefix followed by the current epoch time in
# upper-case hexadecimal and the extension ".htm"; it is created in the
# current directory unless the prefix carries a path.
# Returns an autoflushing FileHandle. Dies if the file cannot be opened.
sub OpenLog {
    my $prefix = shift;

    # Pass the prefix as an argument rather than splicing it into the format
    # string, so a '%' in the prefix is kept literally instead of being
    # interpreted as a conversion.
    my $logfn = sprintf("%s%X.htm", $prefix, time);

    my $fh = FileHandle->new($logfn, "w");
    defined $fh or die "Error opening $logfn for writing: $!\n";
    $fh->autoflush(1);
    return $fh;
}

# Fetch one page of the group pool and tally its photos by owner.
#   $groupid     - Flickr group id
#   $pagenum     - page of the pool to fetch
#   $pagelen     - number of photos per page
#   $ownercounts - hash ref, owner id => photo count; updated in place
#   $ownernames  - hash ref, owner id => owner name encoded as iso-8859-1;
#                  filled in the first time an owner is seen
# Dies with the API's error message if the query still fails after
# FlickrRetry has given up.
# Returns the total number of pages reported by the response.
sub CountPhotos {
    my ($groupid, $pagenum, $pagelen, $ownercounts, $ownernames) = @_;

    my %query = (
        group_id   => $groupid,
        per_page   => $pagelen,
        page       => $pagenum,
        auth_token => $auth_token,
    );

    my $api = Flickr::API->new({ 'key' => $api_key, secret => $shared_secret });
    my $response = FlickrRetry($api, "flickr.groups.pools.getPhotos", \%query);
    $response->{success} or die "Error: $response->{error_message}\n";

    my ($totalpages, $photolist) = GenPhotoList($response);

    foreach my $photo (@{$photolist}) {
        my $owner = $photo->{owner};
        if (!defined $ownercounts->{$owner}) {
            # First photo seen for this owner: remember the name, start at 0.
            $ownernames->{$owner} = encode("iso-8859-1", $photo->{ownername});
            $ownercounts->{$owner} = 0;
        }
        $ownercounts->{$owner}++;
    }

    return $totalpages;
}

# Print the command-line synopsis to standard output and exit with status 1.
sub Usage {
    my $help = "$0 [-pagelen=n] startpagenum endpagenum\n"
             . "\n"
             . "-pagelen=n: \n"
             . "\tSet the page length to n. \n"
             . "\tn must be at least 1. \n"
             . "\tDefault n is 500.\n";
    print $help;
    exit 1;
}

# ---- Main program of tenmil.pl ----
# Scans pool pages startpagenum..endpagenum, counts photos per owner, and
# writes the result to a new log file whose name starts with "x".

# Owner id => number of photos seen, and owner id => display name.
my %ownercounts;
my %ownernames;

my $pagelen = 500;

Getopt::Long::Configure("bundling_override");
# '=i' makes Getopt::Long reject a non-integer page length itself (which
# leads to the usage message) instead of letting arbitrary text reach the
# numeric comparison below.
GetOptions('pagelen=i' => \$pagelen) or Usage();
die "Page length must be at least 1\n" if $pagelen < 1;

# Both page arguments are optional: the start defaults to 1 and the end
# defaults to the start, so a single argument scans exactly one page.
my $startpagenum = (shift or 1);
my $endpagenum = (shift or $startpagenum);

# Page numbers must be plain non-negative integers; anything else would
# otherwise be fed to the numeric comparison and to the '..' range below,
# which treats non-numeric strings as a magical string sequence.
Usage() if grep { !/^\d+$/ } $startpagenum, $endpagenum;

# Accept the two page numbers in either order.
$endpagenum < $startpagenum and
    ($startpagenum, $endpagenum) = ($endpagenum, $startpagenum);

print "Scanning pages $startpagenum to $endpagenum...\n";

for my $pagenum ($startpagenum .. $endpagenum) {
    my $totalpages = CountPhotos($groupid, $pagenum, $pagelen,
	\%ownercounts, \%ownernames);
    # Stop early once the last page of the pool has been processed.
    last if $pagenum >= $totalpages;
}

# The first line of the log records the requested page range; the remaining
# lines are "count,owner id,owner name", highest count first.
my $logfh = OpenLog("x");
print $logfh "Pages $startpagenum to $endpagenum:\n";

for my $key (sort { $ownercounts{$b} <=> $ownercounts{$a} } keys %ownercounts) {
    print $logfh "$ownercounts{$key},$key,$ownernames{$key}\n";
}
# Buffered write errors only surface at close, so check it.
$logfh->close or die "Error closing log file: $!\n";

__END__



runten.pl, a wrapper that runs tenmil.pl block by block, retrying if necessary:


#!perl -w
use strict;

use FileHandle;
use File::DosGlob;

# ---- Main program of runten.pl ----
# Runs tenmil.pl on blocks of ten pages, from the highest block downwards,
# using the "Pages A to B:" header of the existing x*.htm logs to work out
# which block to run next.

@ARGV < 1 and die "Usage: $0 pool-size\n";

my $poolsize = shift;
# The pool size is used in arithmetic below, so insist on a plain integer.
$poolsize =~ /^\d+$/ or die "Usage: $0 pool-size\n";

# Number of 500-photo pages needed to cover the pool, rounded up.
my $pagecount = int(($poolsize + 499) / 500);

# Highest page of the first block to run: the page count rounded up to a
# multiple of ten.
my $topset = int(($pagecount + 9) / 10) * 10;

for (;;) {
    # Go through the whole list of files every time in case the most recent
    # run of tenmil.pl failed.
    my @filelist = File::DosGlob::glob "x*.htm";
    for my $file (@filelist) {
	my $fh = FileHandle->new($file, "r");
	unless (defined $fh) {
	    # An unreadable log cannot tell us anything; skip it rather than
	    # crash on the undefined handle.
	    warn "Can't open $file for reading: $!\n";
	    next;
	}
	my $firstline = <$fh>;
	$fh->close;

	# An empty file has no first line at all.
	next unless defined $firstline;

	# A log ending at or below the current top block means that block is
	# done; move the top down to the block below it.
	if ($firstline =~ /^Pages \d+ to (\d+):/) {
	    $1 <= $topset and $topset = $1 - 10;
	}
    }

    last if $topset <= 0;

    my $startset = $topset - 9;
    system "tenmil.pl $startset $topset";

    # A failed run is retried on the next pass, but if the command could not
    # be started at all, or was killed by a signal, looping again would spin
    # forever -- stop instead.
    $? == -1 and die "Can't run tenmil.pl: $!\n";
    $? & 127 and die sprintf("tenmil.pl terminated by signal %d\n", $? & 127);
}

__END__



tenshort.pl, a script that merges the output from tenmil.pl and updates counts for the top 150 users:


#!perl -w
use strict;

use FileHandle;
use File::DosGlob;

use Flickr::API;
use Encode;

use XML::Simple;
use LWP::UserAgent;
use Time::HiRes qw(usleep);

use ApiKey;

# Pause, in microseconds, taken after every Flickr API call.
my $SLEEPTIME = 500000;

# Query Flickr, repeating the request while it keeps failing.
#   $api    - object providing execute_method()
#   $method - name of the Flickr method to call
#   $param  - hash ref of method arguments
# Up to six calls are made in total, each followed by a pause of $SLEEPTIME
# microseconds. Returns the last response, whether or not it succeeded.
sub FlickrRetry {
    my ($api, $method, $param) = @_;

    my $attempts_left = 6;
    my $response;
    while ($attempts_left-- > 0) {
        $response = $api->execute_method($method, $param);
        usleep $SLEEPTIME;
        last if $response->{success};
    }
    return $response;
}

# Expand every argument with File::DosGlob::glob and return all the results
# as one flat list, in argument order.
sub glob_args {
    my @expanded;
    foreach my $pattern (@_) {
        push @expanded, File::DosGlob::glob($pattern);
    }
    return @expanded;
}

# Look up the pool photo count for one user of a group.
#   $groupid - Flickr group id
#   $userid  - Flickr user id whose pool photos are queried
# The query asks for one photo per page and the function returns the
# {pages} value of the parsed response.
# NOTE(review): with per_page => 1 the page count presumably equals the
# user's photo count in the pool -- confirm against the Flickr API docs.
# Dies with the API's error message if the query still fails after
# FlickrRetry has given up.
sub GetCount {
    my ($groupid, $userid) = @_;

    my %query = (
        group_id   => $groupid,
        user_id    => $userid,
        per_page   => 1,
        auth_token => $auth_token,
    );

    my $api = Flickr::API->new({ 'key' => $api_key, secret => $shared_secret });
    my $response = FlickrRetry($api, "flickr.groups.pools.getPhotos", \%query);
    $response->{success} or die "Error: $response->{error_message}\n";

    my $parser = XML::Simple->new;
    my $tree = $parser->XMLin($response->{_content});

    return $tree->{photos}{pages};
}


# ---- Main program of tenshort.pl ----
# Merges the per-owner counts from the log files named on the command line,
# re-queries the current count for the top users, and writes the ranking to
# newcount.txt.

# Owner id => { owner, name, count } accumulated over all input files.
my %owners;

# Expand wildcards ourselves before <> reads the files named in @ARGV.
@ARGV = glob_args @ARGV;

while (<>) {
    chomp;
    # Skip the "Pages A to B:" header line of each log.
    next if /^Pages /;

    # Data lines are "count,owner id,owner name"; the limit of 3 keeps any
    # commas inside the name intact.
    my ($count, $owner, $ownername) = split(/,/, $_, 3);

    # Lines without all three fields are ignored.
    defined $ownername or next;

    unless (defined $owners{$owner}) {
	$owners{$owner} = { owner => $owner, name => $ownername, count => 0 }
    }

    $owners{$owner}{count} += $count;
}

# Keep at most the 150 owners with the highest merged counts. Slicing a
# fixed 0..149 would produce undefined entries when there are fewer owners,
# which would later become bogus users queried with an undefined user id.
my @ranked = sort { $b->{count} <=> $a->{count} } values %owners;
my $lastidx = $#ranked < 149 ? $#ranked : 149;
my @topusers = @ranked[0 .. $lastidx];

# Replace each merged count with the current figure reported by Flickr.
my $lineno = 0;
for my $user (@topusers) {
    $lineno++;
    print "$lineno. Getting photocount for user $user->{name}...\n";

    $user->{count} = GetCount($groupid, $user->{owner});
}

# The refreshed counts may change the order, so rank again.
@topusers = sort { $b->{count} <=> $a->{count} } @topusers;

my $fh = FileHandle->new("newcount.txt", "w");
defined $fh or die "Can't open newcount.txt for writing: $!\n";

$lineno = 0;
for my $user (@topusers) {
    $lineno++;
    print $fh "$lineno. $user->{name}: $user->{count}\n";
}

# Buffered write errors only surface at close, so check it.
$fh->close or die "Error closing newcount.txt: $!\n";

__END__

From:
Anonymous( )Anonymous This account has disabled anonymous posting.
OpenID( )OpenID You can comment on this post while signed in with an account from many other sites, once you have confirmed your email address. Sign in using OpenID.
User
Account name:
Password:
If you don't have an account you can create one now.
Subject:
HTML doesn't work in the subject.

Message:

 
Notice: This account is set to log the IP addresses of everyone who comments.
Links will be displayed as unclickable URLs to help prevent spam.

Profile

randomfox: (Default)
randomfox

November 2012

S M T W T F S
    123
45678910
11121314151617
18192021222324
25262728 2930 

Most Popular Tags

Style Credit

Expand Cut Tags

No cut tags
Page generated Jul. 23rd, 2017 10:39 am
Powered by Dreamwidth Studios