#! /usr/bin/perl -w
use strict;
#use LWP::Simple;
use LWP::UserAgent;
package NoFace;
#****************************************************************
# SYNOPSIS: *
#****************************************************************
# NOFACE is a script to grab strings off arbitrary web pages based on a regular expression
#****************************************************************
# get_facts FILE
#****************************************************************
# headlines() currently can't be called as a method, only as a plain sub
# (which is how get_facts uses it internally); that is fine for now.
# headlines FACT, REGEX, URI, QUERY_STRING;
# EXAMPLE:
# my $monster = NoFace->new;   # the object is only needed for get_facts
# print NoFace::headlines('high', '.*<b CLASS=obsTempTextA>(\d*°F)</b>.*', 'http://www.weather.com/weather/local/11215', 'x=19&lswe=11215&lswa=WeatherLocalUndeclared&GO=GO&whatprefs=&y=7');
#****************************************************************
# METHODS *
#****************************************************************
################################################################
#if this doesn't make sense, look at the headlines() method
# get_facts FILE
#
# Method. Reads FILE line by line; every non-blank line describes one fact as
# the argument list of headlines(): FACT, REGEX, URI, QUERY_STRING.
#
# Fields are tab-delimited. When a line contains at least one tab it is split
# on tabs only, so a field (typically the regex) may contain spaces. A line
# without any tab is split on runs of whitespace, as older fact files expect.
#
# Returns a hash (as a list) mapping each FACT name to whatever headlines()
# returned for that line. If FILE is missing or cannot be opened, a warning
# is emitted and the empty list is returned.
sub get_facts {
    my (undef, $file) = @_;    # the invocant is not used

    my %headline;

    unless (defined $file && length $file) {
        warn "get_facts: no fact file given\n";
        return %headline;
    }

    # Open the file ourselves rather than assigning it to $ARGV[0] and reading
    # through <>: that overwrote the caller's @ARGV and $_, and the magic open
    # behind <> gives special meaning to names such as "-" or "cmd |".
    my $fh;
    unless (open $fh, '<', $file) {
        warn "Can't open $file: $!\n";
        return %headline;
    }

    while (defined(my $line = <$fh>)) {
        # Trim the line ending (LF or CRLF) and surrounding whitespace.
        $line =~ s/\A\s+//;
        $line =~ s/\s+\z//;

        # A blank line carries no fact; passing it on would call headlines()
        # with no arguments and store the result under an undefined key.
        next if $line eq '';

        my @fact = $line =~ /\t/
            ? split(/\t+/, $line)
            : split(' ', $line);

        $headline{ $fact[0] } = headlines(@fact);
    }
    close $fh;

    return %headline;
}
################################################################
# headlines FACT, REGEX, URI, QUERY_STRING
#
# Plain sub (not a method). POSTs QUERY_STRING to URI and, on a successful
# response, applies REGEX to the page body as a global, case-insensitive,
# dot-matches-newline substitution in which every match is replaced by its
# first capture group. The resulting text is returned, so a REGEX that
# consumes the whole page (".*...(wanted)....*") yields just the capture.
#
# FACT is accepted so that a whole line from the facts file can be passed
# straight through; it is not used here.
#
# Returns the extracted text, or a string starting with "REGEX ERROR:" /
# "BROWSER ERROR:" when something goes wrong. Never dies on a bad REGEX.
sub headlines {
    my ($fact, $regex, $uri, $query_string) = @_;

    $uri          = '' unless defined $uri;
    $query_string = '' unless defined $query_string;

    # Compile the pattern up front. An empty pattern must be rejected: Perl
    # would silently reuse the last successfully matched regex instead. A
    # malformed pattern would otherwise die in the middle of the match.
    my $pattern;
    if (defined $regex && length $regex) {
        local $@;
        $pattern = eval { qr/$regex/is };
    }
    unless (defined $pattern) {
        my $shown = defined $regex ? $regex : '';
        return "REGEX ERROR:\n$uri\ninvalid or empty regex:\n$shown\n";
    }

    # Without a URI there is nothing to request.
    return "BROWSER ERROR:\n$uri\nnot found!\n" unless length $uri;

    my $ua = LWP::UserAgent->new;
    $ua->agent("MSIE/6.0 " . $ua->agent);

    # Create a request
    my $req = HTTP::Request->new(POST => $uri);
    # NOTE(review): the content type is left disabled as in the original;
    # some servers may need it to parse the POSTed form data - confirm.
    #$req->content_type('application/x-www-form-urlencoded');
    $req->content($query_string);

    # Pass request to the user agent and get a response back
    my $res = $ua->request($req);

    # Any unsuccessful response is reported with the same message.
    unless ($res->is_success) {
        return "BROWSER ERROR:\n$uri\nnot found!\n";
    }

    my $page = $res->content;

    # A REGEX without a capture group leaves $1 undefined; substitute the
    # empty string in that case instead of triggering a warning.
    if ($page =~ s{$pattern}{ defined $1 ? $1 : '' }ge) {
        return $page;
    }

    return "REGEX ERROR:\n$uri\ndid not match regex:\n$regex\n";
}
################################################################
# new
#
# Constructor. Takes the class name as invocant and returns a fresh object:
# an empty hash reference blessed into that class. No attributes are set.
sub new {
    my ($class) = @_;

    #$self->{START_XML_TAG} = ""; #start regex
    return bless {}, $class;
}
return 1;
# (Text below is left over from the web page this file was copied from.)
# scraper
# Leave a Reply
# You must be logged in to post a comment.