Tag Archive for perl

Tiff2PDF converter for NYBC Files

#!/usr/bin/env perl

use warnings;
use strict;
use Getopt::Std;
use PDF::FromImage;

our %opts;

getopt('d:f:o:',  %opts);

if (!%opts) {

print STDERR << "EOF";

Usage: [-d directory][-f image file] -o output.pdf

EOF
exit;
}

elsif ($opts{d}) {

my $pdf = PDF::FromImage->new;

my @files = <$opts{d}/*.tif>;

my @rfiles = reverse(@files);

$pdf->load_images(@rfiles);

$pdf->write_file("$opts{o}");
}

if ($opts{f}) {
my $pdf = PDF::FromImage->new;

$pdf->load_images("$opts{f}");

$pdf->write_file("$opts{o}");
}

source

Apache PHP MAMP デスクトップでCGIを動かすhttpd

#
# ScriptAlias: This controls which directories contain server scripts.
# ScriptAliases are essentially the same as Aliases, except that
# documents in the realname directory are treated as applications and
# run by the server when requested rather than as documents sent to the client.
# The same rules about trailing "/" apply to ScriptAlias directives as to
# Alias.
#
ScriptAlias /cgi-bin/ "/Applications/MAMP/cgi-bin/"

<IfModule mod_cgid.c>
#
# Additional to mod_cgid.c settings, mod_cgid has Scriptsock <path>
# for setting UNIX socket for communicating with cgid.
#
#Scriptsock            logs/cgisock
</IfModule>

#
# "/Applications/MAMP/cgi-bin" should be changed to whatever your ScriptAliased
# CGI directory exists, if you have that configured.
#
<Directory "/Applications/MAMP/cgi-bin">
AllowOverride None
Options None
Order allow,deny
Allow from all
</Directory>

# sub dir

<Directory "/Users/*/Desktop">
Options ExecCGI
AddHandler cgi-script .cgi .pl
</Directory>

source

Get Your War On Scrape to RSS

#!/usr/bin/perl

use HTML::Entities;
use LWP::Simple;

# print a feed header
print "<?xml version="1.0" encoding="ISO-8859-1"?>
".
"<rdf:RDF
".
"xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
".
"xmlns:content="http://purl.org/rss/1.0/modules/content/"
".
"xmlns="http://my.netscape.com/rdf/simple/0.9/">
".
"<channel>
".
"  <title>Get Your War On</title>
".
"  <link>http://www.mnftiu.cc/mnftiu.cc/war.html</link>
".
"  <description>A webcomic about our 9/11 epilogue.</description>
".
"</channel>

";

$html_string = get ("http://www.mnftiu.cc/mnftiu.cc/war.html");

$i = 2;

while ($html_string =~ m/<a href="war(d|dd).html">(d|dd)</a>/g)
{
$i++
}

$url = "http://www.mnftiu.cc/mnftiu.cc/war" . $i . ".html";

$html_string = get ($url);

while ($html_string =~ m/<img src="images/gywo.(.*?).gif" border=0>/g)
{
print	"<item>
".
"<title>" . $1 . "</title>
".
"<link>" . $url . "</link>
".
"<description>&lt;img src="http://www.mnftiu.cc/mnftiu.cc/images/gywo." . $1 . ".gif"&gt;</description>
";

print "</item>

";
}

print "</rdf:RDF>
";

source

Broken Gallup Election 2008 Polls Scrape to RSS

#!/usr/bin/perl -w

use strict;
use HTML::TreeBuilder;
use XML::RSS;
use LWP::Parallel::UserAgent;
use LWP::Simple;

my @images;

my $pua = LWP::Parallel::UserAgent->new();
$pua->agent('Mozilla/5.0 (X11; U; Butt Linux i686; en-US; rv:1.9.0.1) Gecko/2008072820 Firefox/3.0.1');

my $base_url = "http://www.gallup.com/poll/election2008.aspx";

my $base_tree = HTML::TreeBuilder->new_from_content(
get($base_url)
);

foreach my $element ($base_tree->look_down("_tag", "div", "class", "section electionanalyses")) {
if (my $img_element = $element->look_down("_tag", "img")) {
push(@images, $img_element->attr_get_i("src"));
}
}

my %trees;

foreach my $element ($base_tree->look_down("_tag", "div", "class", "section electionanalyses topics")->look_down("_tag", "a")) {
$trees{$element->attr_get_i("href")} = "";
}

foreach my $topic_url (keys(%trees)) {
my $request = HTTP::Request->new();

$request->uri($topic_url);
$request->method("GET");

$pua->register($request);
}

my $html = $pua->wait();

foreach my $entry (values(%$html)) {
my $tree = HTML::TreeBuilder->new_from_content(
$entry->response()->content()
);

push(@images, $tree->look_down("_tag", "div", "class", "cmsbody")->look_down("_tag", "img")->attr_get_i("src"));
}

my $rss = XML::RSS->new();

$rss->channel(	title		=>	"Gallop Election 2008 Polls",
link		=>	$base_url,
language	=>	"en",
description	=>	"Gallup has studied human nature and behavior for more than 70 years. Gallup's reputation for delivering relevant, timely, and visionary research on what people around the world think and feel is the cornerstone of the organization. Gallup employs many of the world's leading scientists in management, economics, psychology, and sociology, and our consultants assist leaders in identifying and monitoring behavioral economic indicators worldwide. Gallup consultants help organizations boost organic growth by increasing customer engagement and maximizing employee productivity through measurement tools, coursework, and strategic advisory services. Gallup's 2,000 professionals deliver services at client organizations, through the Web, at Gallup University's campuses, and in 40 offices around the world.",
ttl		=>	"300"
);

use Date::Manip;
my $date = UnixDate(ParseDate(`date`), "%g");

my $description = "";
foreach my $img (@images) {
$description .= "<img src="$img" /><br />
";
}

$rss->add_item(
title		=>	"Gallup Polls - $date",
description	=>	$description,
link		=>	$base_url,
pubDate		=>	$date,
category	=>	"politics",
author		=>	"Gallup, Incorporated",
);

print $rss->as_string();

source

Found Magazine RSS Tidy Up

#!/usr/bin/perl -w

use strict;
use LWP::Simple;

my $feed = get("http://foundmagazine.com/feed/news");

$feed =~ s//thumb///full//g;

print $feed;

source

Engrish Scrape to RSS

#!/usr/bin/perl -w

use strict;
use LWP::Simple;
use XML::RSS;
use HTML::TreeBuilder 3.0;
use Date::Manip;

my $rss = XML::RSS->new(version => '2.0');

$rss->channel(	title		=>	"Engrish",
link		=>	"http://www.engrish.com/",
language	=>	"en",
description	=>	"",
webmaster	=>	"",
ttl		=>	"300"
);

$rss->skipDays(
#		day	=>	"Sunday",
#		day	=>	"Monday",
#		day	=>	"Tuesday",
#		day	=>	"Wednesday",
#		day	=>	"Thursday",
#		day	=>	"Friday",
#		day	=>	"Saturday"
);

# Some kind of loop here to grab feeds.

my $recent = HTML::TreeBuilder->new_from_content(
get("http://www.engrish.com/recent.php"));
my @pictures;
foreach my $tdtag ($recent->look_down("_tag", "td", "bgcolor", "#CCCCCC", "width", "120", "height", "120")) {
push(@pictures, $tdtag->look_down("_tag", "a")->attr_get_i("href"));
}

foreach my $picture (@pictures) {
(my $date) = ($picture =~ m/date=(.*?)$/);
$date = UnixDate(ParseDate($date), "%g");

my $item = HTML::TreeBuilder->new_from_content(get("http://www.engrish.com/$picture"));
my $image = $item->look_down("_tag", "img", "src", qr/image/engrish//);
$image->attr("src", "http://www.engrish.com/" . $image->attr_get_i("src"));
$image = $image->as_HTML();
my $caption = $item->look_down("_tag", "em")->as_text();
my $title = $item->look_down("_tag", "title")->as_text();
my $quip = $item->look_down("_tag", "font", "face", "Times New Roman, Times, serif")->as_text();
($quip) = ($quip =~ m/(.*?)Photo/);

$rss->add_item(	title		=>	"$title",
description	=>	"<![CDATA[<p align="center">$quip<br />$image<br />$caption</p>",
link		=>	"http://www.engrish.com/$picture",
pubDate		=>	"$date",
category	=>	"humor",
author		=>	"",
);
}
print $rss->as_string();

source

APOD Scrape to RSS

#!/usr/bin/perl -w

use strict;
use LWP::Simple;
use LWP::Parallel::UserAgent;
use XML::RSS;
use HTML::TreeBuilder 3.0;
use Date::Manip;

my $rss = XML::RSS->new(version => '2.0');
my $url = "http://antwrp.gsfc.nasa.gov/apod/";
my $pua = LWP::Parallel::UserAgent->new();

$rss->channel(	title		=>	"Astronomy Picture of the Day",
link		=>	$url,
language	=>	"en",
description	=>	"Discover the cosmos! Each day a different image or photograph of our fascinating universe is featured, along with a brief explanation written by a professional astronomer.",
webmaster	=>	"",
ttl		=>	"480"
);

$rss->skipDays(
#		day	=>	"Sunday",
#		day	=>	"Monday",
#		day	=>	"Tuesday",
#		day	=>	"Wednesday",
#		day	=>	"Thursday",
#		day	=>	"Friday",
#		day	=>	"Saturday"
);

my $archive = get("http://antwrp.gsfc.nasa.gov/apod/archivepix.html");
my @pages;

my %dates;
my %titles;
while ($archive =~ m/(.*?):  <a href="(.*?)">(.*?)</a>/g) {
$dates{$url . $2} = $1;
$titles{$url . $2} = $3;
push(@pages, $2);
}

for (my $i = 0; $i < 20; $i++) {
my $link = $url . $pages[$i];

my $httprequest = HTTP::Request->new();
$httprequest->uri($link);
$httprequest->method("GET");

$pua->register($httprequest);
}

my $html = $pua->wait();

foreach my $pod (values(%$html)) {
my $response = $pod->response();
my $podurl = $response->base()->as_string();

my $article = HTML::TreeBuilder->new_from_content(
$response->content()
);

my @chunks = $article->look_down("_tag", "p");
my $description = $chunks[1]->as_HTML() . $chunks[2]->as_HTML();
my $date = UnixDate(ParseDate($dates{$podurl}), "%g");

$rss->add_item(	title		=>	"$titles{$podurl}",
description	=>	"$description",
link		=>	"$podurl",
pubDate		=>	"$date",
category	=>	"science",
author		=>	"Robert Nemiroff, Jerry Bonnell",
);
}

print $rss->as_string();

source

Achewood Scrape to RSS

#!/usr/bin/perl -w

use strict;
use LWP::Simple;
use XML::RSS;
use HTML::TreeBuilder 3.0;
use Date::Manip;

my $rss = XML::RSS->new(version => '2.0');

$rss->channel(
title		=>	"Achewood",
link		=>	"http://www.achewood.com/",
language	=>	"en",
description	=>	""Achewood," like wormwood, was used by antebellum slaves in the production of "achewater," a long-since outmoded and outlawed Southern beverage.

Drinkers of achewater experienced hallucinations
and euphoria, but the after-effects of the liquor produced a deep and lasting melancholy (hence its name).

Modern science has confirmed that achewood oil, the active ingredient in achewater, is a powerful depressant which causes irreversible neurological damage.

Achewater is generally thought to have inspired many Southern folk songs and fables, such as "The Story of Poor John Ritch," "Sullivan's Bear and Dried Bird" and "I'm Following a Little Round Lord."",
webmaster	=>	"chris@achewood.com",
ttl		=>	"300"
);

$rss->skipDays(
day	=>	"Sunday",
#		day	=>	"Monday",
#		day	=>	"Tuesday",
#		day	=>	"Wednesday",
#		day	=>	"Thursday",
#		day	=>	"Friday",
day	=>	"Saturday"
);

my $url = "http://m.assetbar.com/achewood/";

my $tree = HTML::TreeBuilder->new_from_content(get($url));
my $element = $tree->look_down("_tag", "span", "class", "date");
my $date = UnixDate(ParseDate($element->as_text()), "%g");
$element = $tree->find("title");
my $title = $element->as_text();
$element = $tree->look_down("_tag", "img", "src", qr/(.*)/, "title", qr/(.*)/);
my $image = "http://m.assetbar.com" . $element->attr_get_i("src");
my $alttext = $element->attr_get_i("title");

$tree->delete();

$rss->add_item(
title		=>	$title,
description	=>	"<![CDATA[<img src="$image" title="$alttext" />]]>",
link		=>	"http://www.achewood.com/index.php?date=" . UnixDate(ParseDate($date), "%m%d%Y"),
pubDate		=>	$date
);

$tree = HTML::TreeBuilder->new_from_content(get("http://www.achewood.com/"));
my @elements = $tree->look_down("_tag", "td", "bgcolor", "#ffffff", "align", "left", "cellpadding", "5", "colspan", "2");
my $status = $elements[2]->find("b")->as_text();
$elements[2]->find("b")->delete();
my $description = $elements[2]->as_text();

my $statusdate;
if ($status =~ m/(d+/d+)/) {
$statusdate = $1 . "/2007";
}

$rss->add_item(
title		=>	$status,
description	=>	$description,
pubDate		=>	UnixDate(ParseDate($statusdate), "%g"),
link		=>	"http://www.achewood.com/"
);

print $rss->as_string();

source

New York Times Scrape to RSS

#!/usr/bin/perl -w

use strict;
use LWP::Simple;
use HTML::TreeBuilder;
use LWP::Parallel::UserAgent;
use WWW::Mechanize;
use XML::TreeBuilder;
use Getopt::Long;
use HTTP::Cookies;
use Encode;

my $username;
my $password;
my $feedurl;

GetOptions(	"user=s"	=> $username,
"pass=s"	=> $password,
"url=s"		=> $feedurl
);

print STDERR "Getting login page...
";

my $cookiejar = HTTP::Cookies->new();

my $mech = WWW::Mechanize->new();

$mech->agent_alias('Linux Mozilla');
$mech->cookie_jar($cookiejar);

$mech->get("http://www.nytimes.com/auth/login");

my $loginresponse = $mech->submit_form(
form_name	=>	'login',
fields		=>	{
USERID		=>	$username,
PASSWORD	=>	$password
}
);

unless ($loginresponse->is_success()) {
die("Error logging in!
");
}

print STDERR "Logged in successfully!
";

my $pua = LWP::Parallel::UserAgent->new();
$pua->cookie_jar($cookiejar);
$pua->redirect(1);

print STDERR "Getting XML...
";
my $xml = get($feedurl);
my $feed = XML::TreeBuilder->new();
$feed->parse($xml);
my %entries;

print STDERR "Grabbing links...
";

foreach my $item ($feed->look_down("_tag", "item")) {
my $link = $item->look_down("_tag", "link")->as_text();
$link =~ s/?.*//;

my $request = HTTP::Request->new();

print STDERR "Registering $link...
";

$request->uri($link,);
$request->method("GET");

$pua->register($request);

$entries{$link} = $item;
}

print STDERR "Downloading HTML...
";
my $html = $pua->wait();

foreach my $entry (values(%$html)) {
my $response = $entry->response();
my $url = $response->base()->as_string();
$url =~ s/?.*//;

print STDERR "Processing $url...
";

my $item = $entries{$url};

my $articlehtml = HTML::TreeBuilder->new_from_content(
decode_utf8($response->content())
);

if (my $redirelem = $articlehtml->look_down("_tag", "meta", "http-equiv", "refresh")) {
print STDERR "Interstitial ad detected, skipping...
";
my $newurl = $redirelem->attr_get_i("content");

($newurl) = ($newurl =~ m/url=(.*?)/);
$newurl = "http://www.nytimes.com" . $newurl;

print STDERR "Redirect URL is $newurl...
";
}

# Let's clean this up for Liferea, shall we?
my $messyelement;

foreach ($articlehtml->look_down("_tag", "div", "class", "enlargeThis")) {
$_->delete();
}
if ($messyelement = $articlehtml->look_down("_tag", "div", "class", "nextArticleLink")) {
$messyelement->delete();
}

my $description = $entries{$url}->look_down("_tag", "description");
$description->delete_content();

foreach ($articlehtml->look_down("_tag", "div", "class", "image")) {
$description->push_content($_->as_HTML());
}

if ($messyelement = $articlehtml->look_down("_tag", "div", "id", "articleInline")) {
$messyelement->delete();
}

my %pages;
my @pages;
my $pageua = LWP::Parallel::UserAgent->new();
$pageua->cookie_jar($cookiejar);

if (my $pageelem = $articlehtml->look_down("_tag", "div", "id", "pageLinks")) {
print STDERR "Multiple pages detected...
";

foreach ($pageelem->look_down("_tag", "a", "title", qr/^Page/)) {
my $pageurl = "http://www.nytimes.com" . $_->attr_get_i("href");
print STDERR "Registering $pageurl...
";

push(@pages, $pageurl);
my $pagerequest = HTTP::Request->new();
$pagerequest->uri($pageurl);
$pagerequest->method('GET');

$pageua->register($pagerequest);
}

print STDERR "Downloading pages...
";

$pageelem->delete();
}

foreach ($articlehtml->look_down("_tag", "div", "id", "articlebody")) {
$description->push_content($_->as_HTML());
}

my $htmlpages = $pageua->wait();

if ($htmlpages) {
print STDERR "Sorting pages...
";
foreach (values(%$htmlpages)) {
my $pageresponse = $_->response();
my $responseurl = $pageresponse->base()->as_string();

$pages{$responseurl} = $pageresponse;
}

foreach (@pages) {
print STDERR "Processing $_...
";

my $pagehtml = HTML::TreeBuilder->new_from_content(
decode_utf8($pages{$_}->content())
);

# Let's clean this up for Liferea, shall we?

if ($messyelement = $pagehtml->look_down("_tag", "div", "class", "enlargeThis")) {
$messyelement->delete();
}
if ($messyelement = $pagehtml->look_down("_tag", "div", "class", "nextArticleLink")) {
$messyelement->delete();
}

foreach ($pagehtml->look_down("_tag", "div", "class", "image")) {
$description->push_content($_->as_HTML());
}

if ($messyelement = $pagehtml->look_down("_tag", "div", "id", "articleInline")) {
$messyelement->delete();
}

if ($messyelement = $pagehtml->look_down("_tag", "div", "id", "pageLinks")) {
$messyelement->delete();
}

foreach ($pagehtml->look_down("_tag", "div", "id", "articlebody")) {
my $content = $_->as_HTML();
$content =~ s/(Page d+ of d+)//g;
$description->push_content($content);
}
}
}
}

print $feed->as_XML();

source

Words with many doubled letters

# words with three consecutive doubled letters
perl -ne 'print if /(.)1(.)2(.)3/' /usr/share/dict/words

# words with three doubled letters, regardless of whether they’re consecutive.
perl -ne 'print if /(.)1.(.)2.(.)3/' /usr/share/dict/words

# words with four doubled letters
perl -ne 'print if /(.)1.*(.)2.*(.)3.*(.)4/' /usr/share/dict/words

source