#!/usr/bin/perl
#
# This script will gobble up an HTML page, searching for certain keywords.
# When each word is seen, any following links will be printed out as
# pseudo-XML.  Parts with the ID "discard" will be ignored.
#
# Usage: SCRIPT URL LINK-PREFIX ID:TERM [ID:TERM ...]
#
# For every ID that collected at least one link, a file named ID.rss is
# written to the current directory.
#
# Author: Bjarni Rúnar Einarsson, http://bre.klaki.net/
# This program is in the public domain.
# Please respect people's copyrights!
#
use strict;
use warnings;

# Run as a script, but stay loadable (for example by a test) without
# touching the network or the file system.
main(@ARGV) unless caller;

# Entry point: fetch URL, mark the sections named by the ID:TERM phrases,
# collect the links of each section and write one ID.rss file per section.
sub main {
    my ($url, $prefix, @phrases) = @_;

    unless (defined $url && defined $prefix && @phrases) {
        warn "Usage: $0 URL LINK-PREFIX ID:TERM [ID:TERM ...]\n";
        return;
    }

    # Loaded on demand so the helpers below work without the HTTP stack.
    require LWP::Simple;
    my $page = LWP::Simple::get($url);
    die "Could not fetch $url\n" unless defined $page;

    my $items_for = extract_items(mark_sections($page, @phrases), $prefix);

    # Sorted only to make the output order reproducible.
    for my $id (sort keys %{$items_for}) {
        next unless write_feed($id, $prefix, $items_for->{$id});
        print "Created: $id.rss\n";
    }
    return;
}

# Rewrite the page with easy-to-recognize markers for each RSS dump.
#
#   $page    - HTML text to scan
#   @phrases - "ID:TERM" strings; every occurrence of TERM becomes the
#              marker [%BEGIN:ID%]
#
# Returns the marked page with everything before the first marker removed
# and a closing [%BEGIN:discard%] marker appended.
sub mark_sections {
    my ($page, @phrases) = @_;

    for my $phrase (@phrases) {
        my ($id, $term) = split /:/, $phrase, 2;

        # An empty TERM would produce an empty pattern, which Perl treats as
        # "re-use the last successful regex"; an ID containing whitespace
        # could never be found again by the marker scan in extract_items.
        unless (defined $id && length $id && $id !~ /\s/
                && defined $term && length $term) {
            warn "Ignoring malformed phrase '$phrase' (expected ID:TERM)\n";
            next;
        }

        print "Got phrase $term for $id\n";
        unless ($page =~ s/\Q$term\E/[%BEGIN:$id%]/g) {
            print "Uh oh, no match...\n";
        }
    }

    # Trim crap away: drop everything in front of the first marker and make
    # sure the last real section is terminated by another marker.
    $page =~ s/\A.*?(?=\[%BEGIN:)//s;
    $page .= '[%BEGIN:discard%]';

    # NOTE(review): the damaged copy of this script had one more clean-up
    # statement here whose pattern was lost (`$page =~ s//$1/gs;`).  With an
    # empty pattern Perl re-uses the last successful regex, which can delete
    # the first marker, so the statement is deliberately left out.  Restore
    # it if the original pattern is known.

    return $page;
}

# Process all found dumps in order.
#
#   $page   - text produced by mark_sections
#   $prefix - string put in front of every extracted href value
#
# Returns a hash reference mapping each section ID (except "discard") that
# contains at least one link to the concatenated <item> elements for it.
sub extract_items {
    my ($page, $prefix) = @_;
    my %items_for;

    # A section runs from one marker up to (not including) the next one.
    # The ID is matched non-greedily so that markers which follow each other
    # without whitespace are not merged into a single bogus ID.
    while ($page =~ m{\[%BEGIN:(\S+?)%\](.*?)(?=\[%BEGIN:\S+?%\])}gs) {
        my ($id, $data) = ($1, $2);
        next if $id eq 'discard';

        # The href value cannot contain whitespace or quotes here, so it
        # needs no further clean-up.
        while ($data =~ m{<a\b[^>]*href="?([^\s">]+)[^>]*>(.*?)</a>}gis) {
            my ($link, $title) = ($1, $2);
            $title =~ s/<[^>]+>//g;    # drop markup nested in the link text
            $title =~ s/\s+/ /g;       # collapse runs of whitespace

            $items_for{$id} .= " <item>\n\t<link>"
                . xmlesc("$prefix$link")
                . "</link>\n\t<title>"
                . xmlesc($title)
                . "</title>\n </item>\n\n";
        }
    }

    return \%items_for;
}

# Build the complete feed document for one section.
#
#   $id     - section ID, used as the channel title
#   $prefix - link prefix, used as the channel link
#   $items  - already escaped <item> elements from extract_items
#
# NOTE(review): the original channel envelope was lost together with the
# rest of the markup; only the order title / link / description survived.
# RSS 2.0 is used because those three are the channel elements it requires.
# Confirm against whatever consumes the generated files.
sub feed_xml {
    my ($id, $prefix, $items) = @_;
    my ($title, $link) = (xmlesc($id), xmlesc($prefix));

    return <<"EOF" . $items . "</channel>\n</rss>\n";
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>$title</title>
<link>$link</link>
<description>Autogenerated $title.rss</description>
EOF
}

# Write the feed for one section to "$id.rss" in the current directory.
# Returns true on success; on failure a warning is printed and false is
# returned so the caller can carry on with the remaining sections.
sub write_feed {
    my ($id, $prefix, $items) = @_;
    my $file = "$id.rss";

    # Written as UTF-8 to agree with the XML declaration.
    # NOTE(review): assumes get() hands back decoded text, as current
    # LWP::Simple releases do - confirm for the installed version.
    my $rss;
    unless (open $rss, '>:encoding(UTF-8)', $file) {
        warn "Cannot write $file: $!\n";
        return;
    }
    print {$rss} feed_xml($id, $prefix, $items);
    unless (close $rss) {
        warn "Error while closing $file: $!\n";
        return;
    }
    return 1;
}

# Escape the characters that are special in XML text (&, < and >).
# The ampersand is handled first so the entities produced for < and > are
# not escaped a second time.
sub xmlesc {
    my $text = shift;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

1;