This is the code :
@lines is a list of urls.
this code is used to find the link in a remote html file.
Unfortunately,
may occur in $correct_url...
How can I remove it?
-------------------------------
foreach $line (@lines) {
use LWP::Simple;
$broken = 0;
my $url = "$line";
local $SIG{ALRM} = \&died;
alarm 10;
my $results = get ($url);
alarm 0;
$results or $broken = 1;
if ($broken) {
open(BROK, ">>data/broken.txt");
flock(BROK, 2);
print BROK "$url";
close(BROK);
}
else {
while ($results =~ m#href="([^"]+)"#sog) {
$unchecked_link = $1;
unless ($unchecked_link =~ /http:/i or $unchecked_link =~ /mailto:/i or $unchecked_link =~ /#/i or $unchecked_link =~ /ftp:/i or !$unchecked_link =~ /.htm/i or !$unchecked_link =~ /.shtml/i) {
$url =~ s/\n//g;
if ($url =~ /http:\/\/([^"]+)\/([^<]+).html/i) {$url =~ s/$2.html//g;}
if ($url =~ /http:\/\/([^"]+)\/([^<]+).htm/i) {$url =~ s/$2.htm//g;}
if ($url =~ /http:\/\/([^"]+)\/([^<]+).shtml/i) {$url =~ s/$2.shtml//g;}
$correct_url = "$url$unchecked_link";
$correct_url =~ s/
//;
$extended_url .= "$correct_url\n";
}}}}
---------------------------------------