# scraper

#! /usr/bin/perl -w
use strict;
#use LWP::Simple;
use LWP::UserAgent;

package NoFace;

#****************************************************************
#    SYNOPSIS:                                                  *
#****************************************************************

# NOFACE is a script to grab strings off arbitrary web pages based on a regular expression

#****************************************************************
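#    get_facts FILE
#    Reads FILE (one tab-delimited FACT, REGEX, URI, QUERY_STRING record per
#    line), runs headlines on each record and returns a hash of FACT => result.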

#****************************************************************
#HEADLINES currently can't be called as a method, only as a plain sub
#(a method call would pass the object in as FACT), which is fine.

#    headlines FACT, REGEX, URI, QUERY_STRING;

#    EXAMPLE (plain sub call, not a method call):
#    print NoFace::headlines ('high',  '.*<b CLASS=obsTempTextA>(.d*&deg;F)</b>.*', 'http://www.weather.com/weather/local/11215', 'x=19&lswe=11215&lswa=WeatherLocalUndeclared&GO=GO&whatprefs=&y=7');

#****************************************************************
# METHODS                                                       *
#****************************************************************

################################################################
#if this doesn't make sense, look at the headlines() method
# get_facts FILE
#
# Reads FILE line by line.  Every non-empty line is expected to be a
# tab-delimited record
#     FACT <tab> REGEX <tab> URI <tab> QUERY_STRING
# The fields are passed to headlines() and the string it returns is stored
# under FACT.
#
# May be called as a method ($obj->get_facts($file)); the first argument
# (the invocant) is ignored.
#
# Returns the collected FACT => string pairs as a (flattened) hash.  When
# FILE is missing or cannot be opened a warning is issued and the returned
# hash is empty.
sub get_facts {
    my (undef, $file) = @_;    # first argument is the invocant; unused
    my %headline;

    if (!defined $file) {
        warn "get_facts: no file name given\n";
        return %headline;
    }

    # Three-arg open on a lexical handle: unlike the magic <> operator this
    # leaves the caller's @ARGV alone and never treats characters such as
    # '|' or '>' in the file name as a command or an open mode.
    open my $fh, '<', $file or do {
        warn "Can't open $file: $!\n";
        return %headline;
    };

    while (my $line = <$fh>) {
        # Strip the line ending BEFORE testing for an empty line; a raw
        # input line always still carries its "\n", so testing it first can
        # never detect a blank line.
        $line =~ s/\r?\n\z//;
        next if $line eq '';

        my @fact = split /\t/, $line;
        next unless @fact;     # line consisted of nothing but tabs

        $headline{$fact[0]} = headlines(@fact);
    }
    close $fh;

    return %headline;
}

################################################################

# headlines FACT, REGEX, URI, QUERY_STRING
#
# Sends an HTTP POST to URI with QUERY_STRING as the request body and
# reduces the returned page with REGEX.
#
#   FACT          label for the value; not used here (get_facts uses it as
#                 the hash key for the result)
#   REGEX         pattern applied to the page; every match is replaced by
#                 its first capture group ($1)
#   URI           address the request is sent to
#   QUERY_STRING  request body; may be omitted
#
# Returns the page text after the substitution when REGEX matched, a
# "REGEX ERROR:" message when it did not, or a "BROWSER ERROR:" message
# when the request was not successful.
#
# Must be called as a plain sub, not as a method.
sub headlines {
    my ($fact, $regex, $uri, $query_string) = @_;

    # Explicit Class->new calls: the indirect-object form ("new Class") is
    # only parsed as a constructor call as long as no sub named "new" has
    # been declared earlier in this package.
    my $ua = LWP::UserAgent->new;
    $ua->agent("MSIE/6.0 " . $ua->agent);

    # Create a request
    my $req = HTTP::Request->new(POST => $uri);
    #$req->content_type('application/x-www-form-urlencoded');
    # Records with only three fields arrive here without a query string;
    # leave the body untouched rather than passing undef.
    $req->content($query_string) if defined $query_string;

    # Pass request to the user agent and get a response back
    my $res = $ua->request($req);

    # Check the outcome of the response
    unless ($res->is_success) {
        return "BROWSER ERROR:\n$uri\nnot found!\n";
    }

    my $page = $res->content;

    # /g rewrites every match to its first capture group, /s lets "." match
    # newlines and /i ignores case.  With a pattern that spans the whole
    # page (leading and trailing ".*") only the captured text is left.
    if ($page =~ s{$regex}{$1}igs) {
        return $page;
    }

    return "REGEX ERROR:\n$uri\ndid not match regex:\n$regex\n";
}

################################################################

# Constructor.
#
# Takes the class name as its first argument (supplied automatically by
# NoFace->new) and returns a reference to an empty hash blessed into that
# class.  The object carries no state yet.
sub new
{
    my ($class) = @_;

    my $instance = bless({}, $class);
    #$instance->{START_XML_TAG} = "";              #start regex

    return $instance;
}

# A module must end with a true value so that require/use succeeds.  A bare
# "1;" does that and, unlike a top-level "return", is also harmless when the
# file is executed directly.
1;

# source

# Leave a Reply