#!/usr/bin/env perl
use warnings;
use strict;
use Getopt::Std;
use PDF::FromImage;
our %opts;
getopt('d:f:o:', %opts);
if (!%opts) {
print STDERR << "EOF";
Usage: [-d directory][-f image file] -o output.pdf
EOF
exit;
}
elsif ($opts{d}) {
my $pdf = PDF::FromImage->new;
my @files = <$opts{d}/*.tif>;
my @rfiles = reverse(@files);
$pdf->load_images(@rfiles);
$pdf->write_file("$opts{o}");
}
if ($opts{f}) {
my $pdf = PDF::FromImage->new;
$pdf->load_images("$opts{f}");
$pdf->write_file("$opts{o}");
}
Tag Archive for perl
Tiff2PDF converter for NYBC Files
Apache PHP MAMP デスクトップã§CGIã‚’å‹•ã‹ã™httpd
# # ScriptAlias: This controls which directories contain server scripts. # ScriptAliases are essentially the same as Aliases, except that # documents in the realname directory are treated as applications and # run by the server when requested rather than as documents sent to the client. # The same rules about trailing "/" apply to ScriptAlias directives as to # Alias. # ScriptAlias /cgi-bin/ "/Applications/MAMP/cgi-bin/" <IfModule mod_cgid.c> # # Additional to mod_cgid.c settings, mod_cgid has Scriptsock <path> # for setting UNIX socket for communicating with cgid. # #Scriptsock logs/cgisock </IfModule> # # "/Applications/MAMP/cgi-bin" should be changed to whatever your ScriptAliased # CGI directory exists, if you have that configured. # <Directory "/Applications/MAMP/cgi-bin"> AllowOverride None Options None Order allow,deny Allow from all </Directory> # sub dir <Directory "/Users/*/Desktop"> Options ExecCGI AddHandler cgi-script .cgi .pl </Directory>
Get Your War On Scrape to RSS
#!/usr/bin/perl
use HTML::Entities;
use LWP::Simple;
# print a feed header
print "<?xml version="1.0" encoding="ISO-8859-1"?>
".
"<rdf:RDF
".
"xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
".
"xmlns:content="http://purl.org/rss/1.0/modules/content/"
".
"xmlns="http://my.netscape.com/rdf/simple/0.9/">
".
"<channel>
".
" <title>Get Your War On</title>
".
" <link>http://www.mnftiu.cc/mnftiu.cc/war.html</link>
".
" <description>A webcomic about our 9/11 epilogue.</description>
".
"</channel>
";
$html_string = get ("http://www.mnftiu.cc/mnftiu.cc/war.html");
$i = 2;
while ($html_string =~ m/<a href="war(d|dd).html">(d|dd)</a>/g)
{
$i++
}
$url = "http://www.mnftiu.cc/mnftiu.cc/war" . $i . ".html";
$html_string = get ($url);
while ($html_string =~ m/<img src="images/gywo.(.*?).gif" border=0>/g)
{
print "<item>
".
"<title>" . $1 . "</title>
".
"<link>" . $url . "</link>
".
"<description><img src="http://www.mnftiu.cc/mnftiu.cc/images/gywo." . $1 . ".gif"></description>
";
print "</item>
";
}
print "</rdf:RDF>
";
Broken Gallup Election 2008 Polls Scrape to RSS
#!/usr/bin/perl -w
use strict;
use HTML::TreeBuilder;
use XML::RSS;
use LWP::Parallel::UserAgent;
use LWP::Simple;
my @images;
my $pua = LWP::Parallel::UserAgent->new();
$pua->agent('Mozilla/5.0 (X11; U; Butt Linux i686; en-US; rv:1.9.0.1) Gecko/2008072820 Firefox/3.0.1');
my $base_url = "http://www.gallup.com/poll/election2008.aspx";
my $base_tree = HTML::TreeBuilder->new_from_content(
get($base_url)
);
foreach my $element ($base_tree->look_down("_tag", "div", "class", "section electionanalyses")) {
if (my $img_element = $element->look_down("_tag", "img")) {
push(@images, $img_element->attr_get_i("src"));
}
}
my %trees;
foreach my $element ($base_tree->look_down("_tag", "div", "class", "section electionanalyses topics")->look_down("_tag", "a")) {
$trees{$element->attr_get_i("href")} = "";
}
foreach my $topic_url (keys(%trees)) {
my $request = HTTP::Request->new();
$request->uri($topic_url);
$request->method("GET");
$pua->register($request);
}
my $html = $pua->wait();
foreach my $entry (values(%$html)) {
my $tree = HTML::TreeBuilder->new_from_content(
$entry->response()->content()
);
push(@images, $tree->look_down("_tag", "div", "class", "cmsbody")->look_down("_tag", "img")->attr_get_i("src"));
}
my $rss = XML::RSS->new();
$rss->channel( title => "Gallop Election 2008 Polls",
link => $base_url,
language => "en",
description => "Gallup has studied human nature and behavior for more than 70 years. Gallup's reputation for delivering relevant, timely, and visionary research on what people around the world think and feel is the cornerstone of the organization. Gallup employs many of the world's leading scientists in management, economics, psychology, and sociology, and our consultants assist leaders in identifying and monitoring behavioral economic indicators worldwide. Gallup consultants help organizations boost organic growth by increasing customer engagement and maximizing employee productivity through measurement tools, coursework, and strategic advisory services. Gallup's 2,000 professionals deliver services at client organizations, through the Web, at Gallup University's campuses, and in 40 offices around the world.",
ttl => "300"
);
use Date::Manip;
my $date = UnixDate(ParseDate(`date`), "%g");
my $description = "";
foreach my $img (@images) {
$description .= "<img src="$img" /><br />
";
}
$rss->add_item(
title => "Gallup Polls - $date",
description => $description,
link => $base_url,
pubDate => $date,
category => "politics",
author => "Gallup, Incorporated",
);
print $rss->as_string();
Found Magazine RSS Tidy Up
#!/usr/bin/perl -w
use strict;
use LWP::Simple;
my $feed = get("http://foundmagazine.com/feed/news");
$feed =~ s//thumb///full//g;
print $feed;
Engrish Scrape to RSS
#!/usr/bin/perl -w
use strict;
use LWP::Simple;
use XML::RSS;
use HTML::TreeBuilder 3.0;
use Date::Manip;
my $rss = XML::RSS->new(version => '2.0');
$rss->channel( title => "Engrish",
link => "http://www.engrish.com/",
language => "en",
description => "",
webmaster => "",
ttl => "300"
);
$rss->skipDays(
# day => "Sunday",
# day => "Monday",
# day => "Tuesday",
# day => "Wednesday",
# day => "Thursday",
# day => "Friday",
# day => "Saturday"
);
# Some kind of loop here to grab feeds.
my $recent = HTML::TreeBuilder->new_from_content(
get("http://www.engrish.com/recent.php"));
my @pictures;
foreach my $tdtag ($recent->look_down("_tag", "td", "bgcolor", "#CCCCCC", "width", "120", "height", "120")) {
push(@pictures, $tdtag->look_down("_tag", "a")->attr_get_i("href"));
}
foreach my $picture (@pictures) {
(my $date) = ($picture =~ m/date=(.*?)$/);
$date = UnixDate(ParseDate($date), "%g");
my $item = HTML::TreeBuilder->new_from_content(get("http://www.engrish.com/$picture"));
my $image = $item->look_down("_tag", "img", "src", qr/image/engrish//);
$image->attr("src", "http://www.engrish.com/" . $image->attr_get_i("src"));
$image = $image->as_HTML();
my $caption = $item->look_down("_tag", "em")->as_text();
my $title = $item->look_down("_tag", "title")->as_text();
my $quip = $item->look_down("_tag", "font", "face", "Times New Roman, Times, serif")->as_text();
($quip) = ($quip =~ m/(.*?)Photo/);
$rss->add_item( title => "$title",
description => "<![CDATA[<p align="center">$quip<br />$image<br />$caption</p>",
link => "http://www.engrish.com/$picture",
pubDate => "$date",
category => "humor",
author => "",
);
}
print $rss->as_string();
APOD Scrape to RSS
#!/usr/bin/perl -w
use strict;
use LWP::Simple;
use LWP::Parallel::UserAgent;
use XML::RSS;
use HTML::TreeBuilder 3.0;
use Date::Manip;
my $rss = XML::RSS->new(version => '2.0');
my $url = "http://antwrp.gsfc.nasa.gov/apod/";
my $pua = LWP::Parallel::UserAgent->new();
$rss->channel( title => "Astronomy Picture of the Day",
link => $url,
language => "en",
description => "Discover the cosmos! Each day a different image or photograph of our fascinating universe is featured, along with a brief explanation written by a professional astronomer.",
webmaster => "",
ttl => "480"
);
$rss->skipDays(
# day => "Sunday",
# day => "Monday",
# day => "Tuesday",
# day => "Wednesday",
# day => "Thursday",
# day => "Friday",
# day => "Saturday"
);
my $archive = get("http://antwrp.gsfc.nasa.gov/apod/archivepix.html");
my @pages;
my %dates;
my %titles;
while ($archive =~ m/(.*?): <a href="(.*?)">(.*?)</a>/g) {
$dates{$url . $2} = $1;
$titles{$url . $2} = $3;
push(@pages, $2);
}
for (my $i = 0; $i < 20; $i++) {
my $link = $url . $pages[$i];
my $httprequest = HTTP::Request->new();
$httprequest->uri($link);
$httprequest->method("GET");
$pua->register($httprequest);
}
my $html = $pua->wait();
foreach my $pod (values(%$html)) {
my $response = $pod->response();
my $podurl = $response->base()->as_string();
my $article = HTML::TreeBuilder->new_from_content(
$response->content()
);
my @chunks = $article->look_down("_tag", "p");
my $description = $chunks[1]->as_HTML() . $chunks[2]->as_HTML();
my $date = UnixDate(ParseDate($dates{$podurl}), "%g");
$rss->add_item( title => "$titles{$podurl}",
description => "$description",
link => "$podurl",
pubDate => "$date",
category => "science",
author => "Robert Nemiroff, Jerry Bonnell",
);
}
print $rss->as_string();
Achewood Scrape to RSS
#!/usr/bin/perl -w
use strict;
use LWP::Simple;
use XML::RSS;
use HTML::TreeBuilder 3.0;
use Date::Manip;
my $rss = XML::RSS->new(version => '2.0');
$rss->channel(
title => "Achewood",
link => "http://www.achewood.com/",
language => "en",
description => ""Achewood," like wormwood, was used by antebellum slaves in the production of "achewater," a long-since outmoded and outlawed Southern beverage.
Drinkers of achewater experienced hallucinations
and euphoria, but the after-effects of the liquor produced a deep and lasting melancholy (hence its name).
Modern science has confirmed that achewood oil, the active ingredient in achewater, is a powerful depressant which causes irreversible neurological damage.
Achewater is generally thought to have inspired many Southern folk songs and fables, such as "The Story of Poor John Ritch," "Sullivan's Bear and Dried Bird" and "I'm Following a Little Round Lord."",
webmaster => "chris@achewood.com",
ttl => "300"
);
$rss->skipDays(
day => "Sunday",
# day => "Monday",
# day => "Tuesday",
# day => "Wednesday",
# day => "Thursday",
# day => "Friday",
day => "Saturday"
);
my $url = "http://m.assetbar.com/achewood/";
my $tree = HTML::TreeBuilder->new_from_content(get($url));
my $element = $tree->look_down("_tag", "span", "class", "date");
my $date = UnixDate(ParseDate($element->as_text()), "%g");
$element = $tree->find("title");
my $title = $element->as_text();
$element = $tree->look_down("_tag", "img", "src", qr/(.*)/, "title", qr/(.*)/);
my $image = "http://m.assetbar.com" . $element->attr_get_i("src");
my $alttext = $element->attr_get_i("title");
$tree->delete();
$rss->add_item(
title => $title,
description => "<![CDATA[<img src="$image" title="$alttext" />]]>",
link => "http://www.achewood.com/index.php?date=" . UnixDate(ParseDate($date), "%m%d%Y"),
pubDate => $date
);
$tree = HTML::TreeBuilder->new_from_content(get("http://www.achewood.com/"));
my @elements = $tree->look_down("_tag", "td", "bgcolor", "#ffffff", "align", "left", "cellpadding", "5", "colspan", "2");
my $status = $elements[2]->find("b")->as_text();
$elements[2]->find("b")->delete();
my $description = $elements[2]->as_text();
my $statusdate;
if ($status =~ m/(d+/d+)/) {
$statusdate = $1 . "/2007";
}
$rss->add_item(
title => $status,
description => $description,
pubDate => UnixDate(ParseDate($statusdate), "%g"),
link => "http://www.achewood.com/"
);
print $rss->as_string();
New York Times Scrape to RSS
#!/usr/bin/perl -w
use strict;
use LWP::Simple;
use HTML::TreeBuilder;
use LWP::Parallel::UserAgent;
use WWW::Mechanize;
use XML::TreeBuilder;
use Getopt::Long;
use HTTP::Cookies;
use Encode;
my $username;
my $password;
my $feedurl;
GetOptions( "user=s" => $username,
"pass=s" => $password,
"url=s" => $feedurl
);
print STDERR "Getting login page...
";
my $cookiejar = HTTP::Cookies->new();
my $mech = WWW::Mechanize->new();
$mech->agent_alias('Linux Mozilla');
$mech->cookie_jar($cookiejar);
$mech->get("http://www.nytimes.com/auth/login");
my $loginresponse = $mech->submit_form(
form_name => 'login',
fields => {
USERID => $username,
PASSWORD => $password
}
);
unless ($loginresponse->is_success()) {
die("Error logging in!
");
}
print STDERR "Logged in successfully!
";
my $pua = LWP::Parallel::UserAgent->new();
$pua->cookie_jar($cookiejar);
$pua->redirect(1);
print STDERR "Getting XML...
";
my $xml = get($feedurl);
my $feed = XML::TreeBuilder->new();
$feed->parse($xml);
my %entries;
print STDERR "Grabbing links...
";
foreach my $item ($feed->look_down("_tag", "item")) {
my $link = $item->look_down("_tag", "link")->as_text();
$link =~ s/?.*//;
my $request = HTTP::Request->new();
print STDERR "Registering $link...
";
$request->uri($link,);
$request->method("GET");
$pua->register($request);
$entries{$link} = $item;
}
print STDERR "Downloading HTML...
";
my $html = $pua->wait();
foreach my $entry (values(%$html)) {
my $response = $entry->response();
my $url = $response->base()->as_string();
$url =~ s/?.*//;
print STDERR "Processing $url...
";
my $item = $entries{$url};
my $articlehtml = HTML::TreeBuilder->new_from_content(
decode_utf8($response->content())
);
if (my $redirelem = $articlehtml->look_down("_tag", "meta", "http-equiv", "refresh")) {
print STDERR "Interstitial ad detected, skipping...
";
my $newurl = $redirelem->attr_get_i("content");
($newurl) = ($newurl =~ m/url=(.*?)/);
$newurl = "http://www.nytimes.com" . $newurl;
print STDERR "Redirect URL is $newurl...
";
}
# Let's clean this up for Liferea, shall we?
my $messyelement;
foreach ($articlehtml->look_down("_tag", "div", "class", "enlargeThis")) {
$_->delete();
}
if ($messyelement = $articlehtml->look_down("_tag", "div", "class", "nextArticleLink")) {
$messyelement->delete();
}
my $description = $entries{$url}->look_down("_tag", "description");
$description->delete_content();
foreach ($articlehtml->look_down("_tag", "div", "class", "image")) {
$description->push_content($_->as_HTML());
}
if ($messyelement = $articlehtml->look_down("_tag", "div", "id", "articleInline")) {
$messyelement->delete();
}
my %pages;
my @pages;
my $pageua = LWP::Parallel::UserAgent->new();
$pageua->cookie_jar($cookiejar);
if (my $pageelem = $articlehtml->look_down("_tag", "div", "id", "pageLinks")) {
print STDERR "Multiple pages detected...
";
foreach ($pageelem->look_down("_tag", "a", "title", qr/^Page/)) {
my $pageurl = "http://www.nytimes.com" . $_->attr_get_i("href");
print STDERR "Registering $pageurl...
";
push(@pages, $pageurl);
my $pagerequest = HTTP::Request->new();
$pagerequest->uri($pageurl);
$pagerequest->method('GET');
$pageua->register($pagerequest);
}
print STDERR "Downloading pages...
";
$pageelem->delete();
}
foreach ($articlehtml->look_down("_tag", "div", "id", "articlebody")) {
$description->push_content($_->as_HTML());
}
my $htmlpages = $pageua->wait();
if ($htmlpages) {
print STDERR "Sorting pages...
";
foreach (values(%$htmlpages)) {
my $pageresponse = $_->response();
my $responseurl = $pageresponse->base()->as_string();
$pages{$responseurl} = $pageresponse;
}
foreach (@pages) {
print STDERR "Processing $_...
";
my $pagehtml = HTML::TreeBuilder->new_from_content(
decode_utf8($pages{$_}->content())
);
# Let's clean this up for Liferea, shall we?
if ($messyelement = $pagehtml->look_down("_tag", "div", "class", "enlargeThis")) {
$messyelement->delete();
}
if ($messyelement = $pagehtml->look_down("_tag", "div", "class", "nextArticleLink")) {
$messyelement->delete();
}
foreach ($pagehtml->look_down("_tag", "div", "class", "image")) {
$description->push_content($_->as_HTML());
}
if ($messyelement = $pagehtml->look_down("_tag", "div", "id", "articleInline")) {
$messyelement->delete();
}
if ($messyelement = $pagehtml->look_down("_tag", "div", "id", "pageLinks")) {
$messyelement->delete();
}
foreach ($pagehtml->look_down("_tag", "div", "id", "articlebody")) {
my $content = $_->as_HTML();
$content =~ s/(Page d+ of d+)//g;
$description->push_content($content);
}
}
}
}
print $feed->as_XML();
Words with many doubled letters
# words with three consecutive doubled letters perl -ne 'print if /(.)1(.)2(.)3/' /usr/share/dict/words # words with three doubled letters, regardless of whether they’re consecutive. perl -ne 'print if /(.)1.(.)2.(.)3/' /usr/share/dict/words # words with four doubled letters perl -ne 'print if /(.)1.*(.)2.*(.)3.*(.)4/' /usr/share/dict/words