#!/usr/bin/perl # # This script will gobble up an HTML page, searching for # ... pairs and converting # them to RSS. This should manage to create RSS from most reasonably # well done "blogs" out there. # # Author: Bjarni Rúnar Einarsson, http://bre.klaki.net/ # This program is in the public domain. # Please respect people's copyrights! # use LWP::Simple; my $pageurl = shift @ARGV; my $prefix = shift @ARGV; my $mode = shift @ARGV; my $prefix_dir = $prefix; $prefix_dir =~ s/[^\/]+$//; my $prefix_root = $prefix; $prefix_root =~ s/^(http:\/\/[^\/]+)\/.*$/$1/i; my $page = get($pageurl); my $pagename = "Some Blog."; if ($page =~ /(.*?)<\/title>/is) { $pagename = $1; } my $language = "en"; $page =~ s/þ/þ/g; $page =~ s/&Thorn;/Þ/g; $page =~ s/é/é/g; $page =~ s/É/É/g; $page =~ s/á/á/g; $page =~ s/Á/Á/g; $language = "is" if ($page =~ /([þÞ](etta|essu)|[Éé]g|a[ðÐ]|j[áÁúÚ]| og )/i); my $escpre = xmlesc($prefix); print <<EOF; <?xml version="1.0" encoding="ISO-8859-1"?> <!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.92//EN" "http://www.scripting.com/dtd/rss-0_91.dtd" > <rss version="0.92"> <channel> <title>$pagename $escpre $language Autogenerated $id.rss EOF # Process all found dumps in order. my $found = 0; #$prefix =~ s/\/[^\/]*\.html?/\//i; while (($found < 14) && ($page =~ s/]*name=\"(\d+)\"[^>]*>(.*?)]*href=\"?([^>\"]*?\#)\s*\1\"?[^>]*>//is)) { my ($id, $title, $e, $url) = ($1, gettitle($2), $2, $3); my $desc = getdesc($e, $title); $found++; $url =~ s/\s//gs; $url = addprefix($url); # Kill title. $desc =~ s/^.*?\Q$title\E//; $desc =~ s/^\s*(
\s*)+//; print "\n"; print "\t$title\n"; if ($mode =~ /transient/i) { print "\t", xmlesc("$pageurl#$id"), "\n"; } else { print "\t", xmlesc("$url$id"), "\n"; } print "\t", xmlesc("$desc"), "\n"; print "\n"; } if ($found) { # Found entries, we're done. print "\n\n"; exit(0); } $page =~ s/\<(script|style).+?\<\/\1\>//gis; while ($page =~ s/]*name=\"?(\d+)\"?[^>]*>(.{20,300})/$2/is) { my ($id, $title, $e) = ($1, gettitle($2), $2); my $desc = getdesc($e, $title); $found++; print "\n"; print "\t", xmlesc($title), "\n"; print "\t", xmlesc("$prefix#$id"), "\n"; print "\t", xmlesc($desc), "\n"; print "\n"; } print "\n\n"; sub gettitle { my $entry = shift; $entry =~ s/<[^>]+$//s; # Remove this ugly stuff first... # $entry =~ s/<(script|style)([^>]+|\"[^\"]*\")*[^>]*>.*?<\/\1[^>]*>//gis; $entry =~ s/<\/?table([^>]*?\"[^\"]*\")*[^>]*>//gis; # Remove line breaks and   entities... $entry =~ s/\n/ /gs; $entry =~ s/ / /gs; # Prefer bold stuff if it's near the beginning. $entry =~ s/^.{0,60}?<(b|h\d)>(.+?)<\/\1>/$2\n/gs; # Convert
,

and

tags to line breaks. $entry =~ s/
/\n/gis; $entry =~ s/<\/?(p|div|tr)([^>]*?\"[^\"]*\")*[^>]*>/\n\n/gis; # Kill the rest of the HTML markup. $entry =~ s/<([^>]*?\"[^\"]*\")*[^>]*>//gs; # Kill anything that looks like a date # $entry =~ s/^\s*\d+[\.-]\d+[\.-]\d+\s*//s; # Kill weird punctuation $entry =~ s/\s[\.:\[\]\(\)\\\/;]\s/ /gs; # Kill leading weirdness $entry =~ s/^[^A-Z0-9a-záéíóúýþæöðÁÉÍÓÚÝÞÆÖÐ_-]*//s; # Chop everything but the first sentance away, compact white space. $entry =~ s/\n.*$//s; $entry =~ s/\s+/ /gs; $entry =~ s/([^\d][\!\.\?]+)\s.*$/$1/s; $entry =~ s/^(.{50,65})\s+.*$/$1/; $entry =~ s/^(.{65,65}).*$/$1/; # Replace empty entries with "no subject found" $entry =~ s/^\s*$/(no subject found)/s; return $entry; } sub getdesc { my $entry = shift; my $title = shift; # Kill HTML which got truncated... $entry =~ s/<[^>]+$//s; # Grab URLS for later processing. my @urls = ( ); while ((@urls < 5) && ($entry =~ s/<(?:a|img)\s+.*?(?:href|src)=\"?\s*([^\">]+)\"?/]+|\"[^\"]*\")*[^>]*>.*?<\/\1[^>]*>//gis; $entry =~ s/ / /gsi; $entry =~ s/<(p|br|hr|div)[^>]*>/
/gis; $entry =~ s/]>/[img]/gis; $entry =~ s/<([^>]*?\"[^\"]*\")*[^>]*>//gs; # Remove/compact white space... $entry =~ s/\s+/ /gs; $entry =~ s/(
\s*)+/
/gs; $entry =~ s/^\s*(
\s*)*//; $entry =~ s/(
\s*)*$//; # Kill title. $entry =~ s/^.*?\Q$title\E//; $entry =~ s/^\s*(
\s*)+//; # Truncate... if ($entry =~ s/^(.{124,124}).*/$1/) { $entry =~ s/<[^>]+$//i; $entry .= " ..."; } # Append URLs to description. my $i = 1; foreach my $url (@urls) { next if ($url =~ /^(javascript|mailto:)/i); $url = addprefix($url); $entry .= " [
$i]"; $i++; } return $entry; } sub addprefix { my $url = shift; if ($url !~ /^(http|ftp|mailto|javascript):/i) { if ($url =~ /^\//) { return $prefix_root.$url; } else { return $prefix_dir.$url; } } return $url; } sub xmlesc { my $text = shift; $text =~ s/&/&/g; $text =~ s//>/g; return $text; }