# $Id: Schemeless.pm,v 1.8 2005/03/22 16:03:11 roderick Exp $ # # Copyright (c) 2000 Michael G. Schwern. All rights reserved. This # program is free software; you can redistribute it and/or modify it # under the same terms as Perl itself. package URI::Find::Schemeless; use strict; use base qw(URI::Find); # base.pm error in 5.005_03 prevents it from loading URI::Find if I'm # required first. use URI::Find (); use vars qw($VERSION); $VERSION = q$Revision: 1.8 $ =~ /(\d\S+)/ ? $1 : '?'; my($dnsSet) = 'A-Za-z0-9-'; my($cruftSet) = __PACKAGE__->cruft_set . '<>?}'; my($tldRe) = __PACKAGE__->top_level_domain_re; my($uricSet) = __PACKAGE__->uric_set; =head1 NAME URI::Find::Schemeless - Find schemeless URIs in arbitrary text. =head1 SYNOPSIS require URI::Find::Schemeless; my $finder = URI::Find::Schemeless->new(\&callback); The rest is the same as URI::Find. =head1 DESCRIPTION URI::Find finds absolute URIs in plain text with some weak heuristics for finding schemeless URIs. This subclass is for finding things which might be URIs in free text. Things like "www.foo.com" and "lifes.a.bitch.if.you.aint.got.net". The heuristics are such that it hopefully finds a minimum of false positives, but there's no easy way for it know if "COMMAND.COM" refers to a web site or a file. =cut sub schemeless_uri_re { @_ == 1 || __PACKAGE__->badinvo; return qr{ # Originally I constrained what couldn't be before the match # like this: don't match email addresses, and don't start # anywhere but at the beginning of a host name # (?()\{\}\[\]]) ) # hostname (?: [$dnsSet]+(?:\.[$dnsSet]+)*\.$tldRe | (?:\d{1,3}\.){3}\d{1,3} ) # not inet_aton() complete (?: (?=[\s\Q$cruftSet\E]) # followed by unrelated thing (?!\.\w) # but don't stop mid foo.xx.bar (? my $tld_re = $self->top_level_domain_re; Returns the regex for matching top level DNS domains. The regex shouldn't be anchored, it shouldn't do any capturing matches, and it should make itself ignore case. =cut sub top_level_domain_re { @_ == 1 || __PACKAGE__->badinvo; my($self) = shift; my $plain = join '|', qw( aero biz com coop edu gov info int mil museum name net org pro ); return qr/(?:[a-z]{2}|$plain)/; } =head1 AUTHOR Original code by Roderick Schertler , adapted by Michael G Schwern . Currently maintained by Roderick Schertler . =head1 SEE ALSO L =cut 1;