#! /usr/bin/perl -w
use strict;
use HTTP::Request::Common;
use LWP::UserAgent;
#The Perl Black Book, Holzner, p. 1247
=item how to call submit_query():
#Method 1.
my %example = (test1 => 'noah', test2 => 'sussman');
submit_query("http://suburbanangst.com/reg.php", %example);
#Method 2.
submit_query("http://suburbanangst.com/reg.php?test1=foo&test2=bar;");
=cut
sub submit_query {
    # Send an HTTP POST to FILE (a URL) and dump the complete response to
    # STDOUT.  Any arguments after FILE are taken as key/value pairs and
    # submitted as the form fields of the request.
    my $file  = shift;
    my %query = @_;

    # Put an MSIE token in front of the stock LWP agent string.
    my $user_agent  = LWP::UserAgent->new;
    my $stock_agent = $user_agent->agent;
    $user_agent->agent("MSIE/5.5 " . $stock_agent);

    # POST is imported from HTTP::Request::Common; the form fields go in as
    # an array reference built from the hash.
    #'http://suburbanangst.com/reg.php',
    my @form_fields = %query;
    my $request     = POST($file, \@form_fields);

    my $response = $user_agent->request($request);
    print $response->as_string;
}
Tag Archive for agent
submit a POST query with parameters
scraper
#! /usr/bin/perl -w
use strict;
#use LWP::Simple;
use LWP::UserAgent;
package NoFace;
#****************************************************************
# SYNOPSIS: *
#****************************************************************
# NOFACE is a script to grab strings off arbitrary web pages based on a regular expression
#****************************************************************
# get_facts FILE
#****************************************************************
#HEADLINES currently can't be called as a method, only as a plain sub
#(get_facts calls it internally), which is fine.
# headlines FACT, REGEX, URI, QUERY_STRING;
# EXAMPLE:
# my $monster = NoFace->new; # only needed for the get_facts method
# print NoFace::headlines ('high', '.*<b CLASS=obsTempTextA>(\d*°F)</b>.*', 'http://www.weather.com/weather/local/11215', 'x=19&lswe=11215&lswa=WeatherLocalUndeclared&GO=GO&whatprefs=&y=7');
#****************************************************************
# METHODS *
#****************************************************************
################################################################
#if this doesn't make sense, look at the headlines() method
sub get_facts {
    # Read a fact file and look every fact in it up with headlines().
    #
    # Called as a method: the invocant is ignored and the next argument is
    # the path of the fact file.  Each non-blank line of that file holds
    # tab-separated fields in the order headlines() takes them:
    #     FACT <tab> REGEX <tab> URI <tab> QUERY_STRING
    # Returns a hash (as a flat list) that maps each FACT to whatever
    # headlines() returned for its line.  If the file cannot be opened a
    # warning is printed and the hash is empty.
    my (undef, $file) = @_;
    my %headline;

    if (!defined $file) {
        warn "get_facts: no fact file given\n";
        return %headline;
    }

    # Open the file directly rather than overwriting $ARGV[0] and reading
    # through <>, which clobbered the caller's @ARGV and would also have
    # read any further files named on the command line.
    open my $fh, '<', $file or do {
        warn "get_facts: can't open $file: $!\n";
        return %headline;
    };

    while (my $line = <$fh>) {
        $line =~ s/\r?\n\z//;           # drop the line ending (LF or CRLF)
        next unless $line =~ /\S/;      # skip blank lines

        # Split on tabs only: splitting on ' ' breaks on any whitespace and
        # would cut a regex or query string that contains a space.
        my @fact = split /\t/, $line;
        $headline{ $fact[0] } = headlines(@fact);
    }
    close $fh;

    return %headline;
}
################################################################
sub headlines {
    # POST to URI and reduce the returned page to what REGEX captures.
    #
    # Arguments (positional):
    #   FACT          label of the lookup; accepted but not used here
    #   REGEX         pattern applied to the page; its first capture group
    #                 is what replaces each match
    #   URI           address the POST request is sent to
    #   QUERY_STRING  sent verbatim as the body of the request
    # Returns the page text after every match of REGEX (case-insensitive,
    # "." also matching newlines) has been replaced by its first capture,
    # or a "REGEX ERROR" / "BROWSER ERROR" message when the pattern does not
    # match or the request fails.
    my ($fact, $regex, $uri, $query_string) = @_;

    # Direct method calls instead of "new LWP::UserAgent": this package
    # defines its own new(), which makes the indirect-object form depend on
    # the order in which the subs happen to be compiled.
    my $ua = LWP::UserAgent->new;
    $ua->agent("MSIE/6.0 " . $ua->agent);

    # Create a request
    my $req = HTTP::Request->new(POST => $uri);
    #$req->content_type('application/x-www-form-urlencoded');
    # Leave the request body alone when no query string was supplied.
    $req->content($query_string) if defined $query_string;

    # Pass request to the user agent and get a response back
    my $res = $ua->request($req);

    # Check the outcome of the response
    unless ($res->is_success) {
        return "BROWSER ERROR:\n$uri\nnot found!\n";
    }

    my $page = $res->content;
    if ($page =~ s{$regex}{$1}igs) {
        return $page;
    }
    return "REGEX ERROR:\n$uri\ndid not match regex:\n$regex\n";
}
################################################################
sub new
{
    # Constructor: hands back a new, empty hash-based object blessed into
    # the class it was invoked on.
    my ($class) = @_;
    # disabled: $self->{START_XML_TAG} = ""; #start regex
    return bless {}, $class;
}
return 1;
Grab linked files from a list of web pages
#!/usr/local/bin/perl -w
use strict;
use LWP::Simple;
use LWP::UserAgent;
use HTML::LinkExtor;
use URI::URL;
#MAIN----------------------------------------------------------------------------
#Extract info from the tags in these files:
# Settings for this run.
my $tag_type;        #Extract info from this type of tag ONLY (compared as a plain string by list_links, so give a single tag name)
my $local_directory; #Save files here
my $extensions;      #Only save files with these extensions (ok to use | here)
$extensions = &grab_what(); #Let the user choose what type of files to grab
#The "&" sidesteps perl's "called too early to check prototype" warning if grab_what is declared with a prototype
# $ARGV[0] = "c:/windows/desktop/list.txt"; #List of urls to search for files
$local_directory = "c:/windows/desktop/grabit/"; #Store grabbed files here
$tag_type = "a"; #Look in <A> tags for file URIs
#
die "\n*******************\nERROR\n*******************\n"
  . "Please create the directory:\n$local_directory\n"
  unless -d $local_directory; #unless local really is a directory...
#
while (my $line = <>) { #Assume we are reading a file with one URL on each line
    chomp(my $url = $line);
    next if $url eq "";
    # Hand over the chomped URL; the raw input line still carries its newline.
    grabit($url, $tag_type, $local_directory, $extensions);
    print $url . "\n";
    #print " "; #delete urls from list file once they've been grabbed
}
#GRABIT--------------------------------------------------------------------------
#Just a wrapper for grab_hyperlinked
#makes it easier to call grab_hyperlinked iteratively
sub grabit {
    # Thin convenience wrapper: forwards URL, TAG_TYPE, LOCAL_DIRECTORY and
    # EXTENSIONS, in that order, to grab_hyperlinked and hands back whatever
    # it returns.
    my @forwarded = @_[0 .. 3];
    return grab_hyperlinked(@forwarded);
}
#GRAB_HYPERLINKED----------------------------------------------------------------
#Search the file at URL for tags of type TAG_TYPE and grab those targets that end with arbitrarily chosen EXTENSIONS
sub grab_hyperlinked {
    # Download every file linked from the page at URL whose address ends in
    # one of the wanted extensions.
    #
    #   URL              page to scan
    #   TAG_TYPE         tag name passed on to list_links
    #   LOCAL_DIRECTORY  where the files are saved; prepended to the file
    #                    name as-is
    #   EXTENSIONS       extension, or several joined with "|", without the
    #                    leading dot (e.g. "jpg|gif")
    my ($url, $tag_type, $local_directory, $extensions) = @_;

    # "\." is a literal dot (a bare "." would also accept e.g. "xjpg"), and
    # the pattern is rebuilt on every call instead of being frozen with /o,
    # so a later call with different EXTENSIONS is honoured.
    my $wanted = qr{\.(?:$extensions)$}i;

    my @links = list_links($url, $tag_type);
    #@links = @links[0 .. 7]; #only get the first X images (or comment this out to get all)
    foreach my $image_uri (@links) {
        next if $image_uri eq "";
        next unless $image_uri =~ $wanted; #Only save files with the specified extensions

        # Everything after the last "/" is the file name.
        my $image_name = "$image_uri";
        $image_name =~ s{.*/(.*)}{$1};

        # smart_save picks a name that does not clash with an existing file.
        $image_name = smart_save($image_name, $local_directory);
        save_image($image_uri, $local_directory . $image_name);
        #print $image_uri;
    }
    return;
}
#SMART SAVE------------------------------------------------------------------------------
#This script checks to see if the file FILE_NAME already exists in DIRECTORY
#and if so, adds an integer to the end of the file's name, before the extension
#ie, if there are 2 files named foo.bar, then the second one to be saved will be renamed foo-1.bar
#The RETURN VALUE is the new name of the file.
sub smart_save {
    # Pick a name for FILE_NAME that does not clash with anything already
    # present in DIRECTORY.
    #
    #   FILE_NAME  bare file name (no path)
    #   DIRECTORY  directory to check; it is prepended to the name as-is, so
    #              it has to end with a path separator
    # Returns FILE_NAME unchanged when nothing by that name exists yet.
    # Otherwise returns the name with "-N" inserted in front of its first
    # ".", N being the lowest integer >= 1 that gives an unused name
    # (foo.bar, then foo-1.bar, then foo-2.bar, ...).
    my ($file_name, $directory) = @_;

    return $file_name unless -e $directory . $file_name;

    # Everything before the first "." is the stem, the rest the extension.
    my ($stem, $ext) = $file_name =~ m{\A([^.]*)(.*)\z}s;

    # Start counting at 1 so the first duplicate of foo.bar becomes
    # foo-1.bar, as the description of this sub promises.
    my $int = 1;
    $int++ while -e $directory . $stem . "-" . $int . $ext;

    return $stem . "-" . $int . $ext;
}
#SAVE IMAGE------------------------------------------------------------------------------
#This script will grab an image from a web page and save it locally
#my $file = 'http://localhost/libraries/images/oiltower/top_boom.jpg'; #This is the name of the image on the server
#my $download = 'c:/windows/desktop/grabbed.jpg'; #This is where the image will be saved locally
#save_image($file, $download);
sub save_image { #copy web FILE to local DOWNLOAD location
    # Issue a GET for the URL in FILE and pass DOWNLOAD to the user agent as
    # the local destination of the content.  The response object is
    # returned; it is not inspected here.
    my ($file, $download) = @_;
    my $fetch    = HTTP::Request->new(GET => $file);
    my $response = LWP::UserAgent->new->request($fetch, $download);
    return $response;
}
#LIST LINKS---------------------------------------------------------------------
#Extract the URL information from every tag of type TAG_TYPE on the page (filtering by file extension is left to the caller, grab_hyperlinked)
#Returns an array containing the absolute URL of each of the links
#This code is adapted from the HTML::LinkExtor docs
#my $temp = "c:/windows/desktop/grabit.temp";
#my $url = "http://localhost/lwp/pics.html"; # for instance
#my @links = list_links($url, $temp);
sub list_links {
    # Fetch the page at URL and collect the attribute values of every tag
    # whose name equals TAG_TYPE.
    #
    #   URL       page to fetch
    #   TAG_TYPE  tag name to look at, compared as a plain string
    # Returns the collected values as a list, each made absolute against
    # the base of the response.
    my ($url, $tag_type) = @_;
    my $user_agent = LWP::UserAgent->new;
    #$user_agent->agent("MSIE/5.5 " . $user_agent->agent);

    # Filled in by the parser callback below.
    my @images = ();

    # Make the parser. Unfortunately, we don't know the base yet
    # (it might be different from $url)
    my $p = HTML::LinkExtor->new(
        sub {
            my ($tag, %attributes) = @_;
            return if $tag ne $tag_type; # we only look closer at the tags specified by TAG_TYPE
            push(@images, values %attributes);
        }
    );

    # Request document and parse it as it arrives
    my $response = $user_agent->request(
        HTTP::Request->new(GET => $url),
        sub { $p->parse($_[0]) },
    );

    # Expand all image URLs to absolute ones
    my $base = $response->base;
    @images = map { url($_, $base)->abs } @images;

    # Print them out (debugging aid, disabled)
    #print join("\n", @images), "\n";
    return @images;
}
#******************************************************************************
#* *
#* USER-QUERY FUNCTIONS: *
#* *
#******************************************************************************
#GRAB WHAT?--------------------------------------------------------------------
#Let the user choose what type(s) of files to grab
sub grab_what {
    # Show a numbered menu of extension sets on STDOUT, read the user's
    # choice from STDIN and return the chosen set.
    #
    # Takes no arguments.  Returns one entry of the list below, e.g.
    # "jpg|gif|mpg".  Dies unless the input is a whole number between 1 and
    # the number of menu entries.
    my @extensions = qw(
        jpg|gif|mpg
        wav|zip
        zip
        wav
        mp3
    );

    print "Welcome to Grabit by Noah Sussman\n";
    my $option_id = 1;
    foreach my $ext (@extensions) {
        print "$option_id) $ext\n";
        $option_id++;
    }
    print "\nWhat type(s) of files would you like to grab?\n";

    my $selection = <STDIN>;
    $selection = "" unless defined $selection;    # end of input counts as no answer
    $selection =~ s/\A\s+|\s+\z//g;               # drop the newline and stray blanks

    # Check the number itself: with only an index lookup, 0, an empty line
    # or any non-numeric text becomes index -1 and silently picks the last
    # menu entry.
    die "You must enter a number corresponding to an option!!"
        unless $selection =~ /\A[0-9]+\z/
            && $selection >= 1
            && $selection <= @extensions;

    my $chosen = $extensions[$selection - 1];
    print "Extension set to \"$chosen\".\nGrabbing...\n";
    return $chosen;
}
##############################
##############################
##############################
##############################
##############################
##############################
##############################
##########END#################
##############################
##############################
##############################
##############################
##############################
##############################
##############################
Category: Uncategorized |
Tags: 2001, agent, automation, beginner, browser, commandline, download, easy, file, filter, grabit, http, interactive, lwp, perl, Shell, simple, text, web