00001 #! /usr/bin/perl
00002 # vim:ts=4:sw=4:ai:et:si:sts=4
00003
00004 package BBCLocation;
00005 use strict;
00006 use warnings;
00007 require Exporter;
00008
00009 use utf8;
00010 use encoding 'utf8';
00011 use LWP::UserAgent;
00012 use JSON;
00013 use XML::XPath;
00014 use XML::XPath::XMLParser;
00015 use URI::Escape;
00016
00017
00018 our @EXPORT = qw(Search FindLoc);
00019 our $VERSION = 0.3;
00020
00021 my @searchresults;
00022 my @resulturl;
00023 my $resultcount = -1;
00024
00025 sub Search {
00026 my ($search_string, $dir, $timeout, $logdir) = @_;
00027 $search_string = uri_escape($search_string);
00028
00029 my $base_url = 'http://www.bbc.co.uk/locator/client/weather/en-GB/' .
00030 'search.json';
00031 my $search_url = $base_url . '?ptrt=/&search=';
00032
00033
00034 my $file = $search_string;
00035 getCachedJSON($search_url . $search_string, $dir, $file, $timeout, $logdir);
00036
00037 my $cachefile = "$dir/$file.json";
00038 my $cachefile1 = "$dir/$file-results.html";
00039 my $cachefile2 = "$dir/$file-pagination.html";
00040
00041 open OF, "<:utf8", $cachefile or die "Can't read $cachefile: $!\n";
00042 my $content = do { local $/; <OF> };
00043 close OF;
00044
00045 my $decoded = decode_json $content;
00046 $resultcount = $decoded->{"noOfResults"};
00047
00048 my %loc_hash = ();
00049
00050 get_results($cachefile1, \%loc_hash);
00051
00052 if (exists $decoded->{"pagination"}) {
00053 my %pages = ();
00054 my $xp = XML::XPath->new(filename => $cachefile2);
00055 my $nodeset = $xp->find("//ol/li/a");
00056 foreach my $node ($nodeset->get_nodelist) {
00057 my $url = $node->getAttribute("href");
00058 my $num = $node->string_value;
00059 $url =~ s/&/&/;
00060 $num =~ s/
00061 $pages{$num} = $url;
00062 }
00063
00064 foreach my $page (keys %pages)
00065 {
00066 getCachedJSON($base_url . $pages{$page}, $dir, $file . "-$page",
00067 $timeout, $logdir);
00068
00069 my $cachefile3 = "$dir/$file-$page-results.html";
00070 get_results($cachefile3, \%loc_hash);
00071 }
00072 }
00073
00074 my @searchresults = ();
00075 foreach my $key (keys %loc_hash)
00076 {
00077 my $resultline = $key."::".$loc_hash{$key};
00078 push (@searchresults, $resultline);
00079 }
00080 return @searchresults;
00081 }
00082
00083 sub getCachedJSON {
00084 my ($url, $dir, $file, $timeout, $logdir) = @_;
00085
00086 my $cachefile = "$dir/$file.json";
00087 my $cachefile1 = "$dir/$file-results.html";
00088 my $cachefile2 = "$dir/$file-pagination.html";
00089
00090 my $now = time();
00091 my $decoded;
00092
00093 log_print( $logdir, "Loading URL: $url\n" );
00094
00095 if( (-e $cachefile) and ((stat($cachefile))[9] >= ($now - $timeout)) ) {
00096 # File cache is still recent.
00097 log_print( $logdir, "cached in $cachefile\n" );
00098 } else {
00099 log_print( $logdir, "$url\ncaching to $cachefile\n" );
00100
00101 my $ua = LWP::UserAgent->new;
00102 $ua->timeout(30);
00103 $ua->env_proxy;
00104 $ua->default_header('Accept-Language' => "en");
00105
00106 my $response = $ua->get($url);
00107 if ( !$response->is_success ) {
00108 die $response->status_line;
00109 }
00110
00111 open OF, ">:utf8", $cachefile or die "Can't open $cachefile: $!\n";
00112 print OF $response->content;
00113 close OF;
00114
00115 $decoded = decode_json $response->content;
00116
00117 open OF, ">:utf8", $cachefile1 or die "Can't open $cachefile1: $!\n";
00118 print OF "<html>".$decoded->{"results"}."</html>";
00119 close OF;
00120
00121 if (exists $decoded->{"pagination"}) {
00122 open OF, ">:utf8", $cachefile2 or
00123 die "Can't open $cachefile2: $!\n";
00124 print OF "<html>".$decoded->{"pagination"}."</html>";
00125 close OF;
00126 } else {
00127 unlink $cachefile2;
00128 }
00129 }
00130 }
00131
00132 sub get_results {
00133 my ($file, $outhash) = @_;
00134
00135 my $xp = XML::XPath->new(filename => $file);
00136 my $nodeset = $xp->find("//ul/li/a");
00137 foreach my $node ($nodeset->get_nodelist) {
00138 my $url = $node->getAttribute("href");
00139 my $loc = $node->string_value;
00140
00141 $url =~ s/^\/weather\
00142 $outhash->{$url} = $loc;
00143 }
00144 }
00145
00146 sub log_print {
00147 return if not defined $::opt_D;
00148 my $dir = shift;
00149
00150 open OF, ">>$dir/uk_bbc.log";
00151 print OF @_;
00152 close OF;
00153 }
00154
00155 sub FindLoc {
00156 my ($locid, $dir, $timeout, $logdir) = @_;
00157
00158 my $url = "http://www.bbc.co.uk/weather/$locid";
00159
00160 my $file = "$locid.html";
00161 getCachedHTML($url, $dir, $file, $timeout, $logdir);
00162
00163 my $cachefile = "$dir/$file";
00164
00165 open OF, "<:utf8", $cachefile;
00166 my $contents = do { local $/; <OF>; };
00167 close OF;
00168
00169 my ($rssid) = ($contents =~ /data-loc="(.*?)"/);
00170 die "No RSS Location found for ID $locid!\n" unless defined $rssid;
00171
00172 $rssid =~ s/^LOC-
00173 return $rssid;
00174 }
00175
00176
00177 sub getCachedHTML {
00178 my ($url, $dir, $file, $timeout, $logdir) = @_;
00179
00180 my $cachefile = "$dir/$file";
00181
00182 my $now = time();
00183
00184 log_print( $logdir, "Loading URL: $url\n" );
00185
00186 if( (-e $cachefile) and ((stat($cachefile))[9] >= ($now - $timeout)) ) {
00187 # File cache is still recent.
00188 log_print( $logdir, "cached in $cachefile\n" );
00189 } else {
00190 log_print( $logdir, "$url\ncaching to $cachefile\n" );
00191
00192 my $ua = LWP::UserAgent->new;
00193 $ua->timeout(30);
00194 $ua->env_proxy;
00195 $ua->default_header('Accept-Language' => "en");
00196
00197 my $response = $ua->get($url);
00198 if ( !$response->is_success ) {
00199 die $response->status_line;
00200 }
00201
00202 my $content = $response->content;
00203 open OF, ">:utf8", $cachefile or die "Can't open $cachefile: $!\n";
00204 print OF $content;
00205 close OF;
00206 }
00207 }
00208
00209
00210 1;