# Check every page linked from a saved Google results file for HTML Tidy problems.
# "lwp-request -o links" prints one "TAG<whitespace>URL" line per link. The grep keeps the
# A (anchor) links that point at MY_DOMAIN; perl then fetches each URL, pipes the page
# through tidy, and keeps only the Error/DOCTYPE lines, preceded by the page URL.
# Pages that produce no such lines add nothing to report.txt.
# NOTE(review): the perl program is wrapped in double quotes, so a POSIX shell would expand
# $1, $_ and $notify before perl sees them -- use single quotes there.
# NOTE(review): each linked URL is interpolated into a shell command by qx{}; only run this
# against link lists you trust.
lwp-request -o links file:///SAVED_GOOGLE_RESULTS.htm|grep -P "A\s*http://\w*.MY_DOMAIN" | perl -pe "m#A\s*(.*)#; $notify = qq{\n$1:\n}; $_=qx{lwp-request $1|tidy -eq 2>&1|grep -e Error -e DOCTYPE}; $_ = $notify .$_ if $_" > report.txt
#Alternate: print out just the HTTP response code for linked pages that have my domain in the link
# s#\w*\s*## strips the leading tag name and whitespace that "lwp-request -o links" puts in
# front of each URL; lines that do not mention onemorebug.com are blanked out, the rest get
# the output of "lwp-request -ds URL" appended.
# NOTE(review): the perl program is wrapped in double quotes, so a POSIX shell would expand
# $_ before perl sees it -- use single quotes there.
lwp-request -o links http://onemorebug.com|perl -pe "chomp; $_ =~ s#\w*\s*##; undef $_ unless m/onemorebug.com/; $_ .= qq{ } . qx{lwp-request -ds $_} if $_"
#old version
# Earlier form: prints only the URL of each linked MY_DOMAIN page whose tidy output contains
# a DOCTYPE line (the tidy output itself is not printed separately from the -p echo of $_).
# DOCTYPE is left unquoted because nested double quotes would end the outer quoted program.
# NOTE(review): the perl program is wrapped in double quotes, so a POSIX shell would expand
# $1, $_ and $notify before perl sees them -- use single quotes there.
lwp-request -o links file:///C:/SAVED_GOOGLE_RESULTS.htm|grep -P "A\s*http://\w*.MY_DOMAIN" | perl -pe "m#A\s*(.*)#; $notify = qq{ $1: }; $_=qx{lwp-request $1|tidy -e 2>&1 | grep DOCTYPE}; print $notify if $_"
check linked pages for Tidy validation errors, on the command line
Category: Uncategorized |
Tags: Bash, command, commandline, crawler, crawling, one-liners, perl, scraping, textonly, tidy, validation
Leave a Reply
You must be logged in to post a comment.