#!/usr/bin/perl package Goo::TextUtilities; ############################################################################### # trexy.com - miscellaneous utilities for handling text # # Copyright Nigel Hamilton 2002 # All Rights Reserved # # Author: Nigel Hamilton # Filename: Goo::TextUtilities.pm # Description: Miscellaneous utilities for handling text # # Date Change # ----------------------------------------------------------------------------- # 07/05/2001 Version 1 # 17/03/2003 Expanded to handle HTML, Javascript etc. # 15/05/2003 ' this character was not being stripped from # the HTML sent to the browser needed to strip it out # 03/08/2005 Added getMatchingLineNumber # ############################################################################### use strict; use URI; use Goo::Logger; ############################################################################### # # get_hostname - return a hostname from the url # ############################################################################### sub get_hostname { my ($url) = @_; my $hostname; # prepend http:// to the URL if it is missing - but why would it be missing? # watch out for perltidy! unless ($url =~ /^http:\/\//i) { $url = "http://" . $url; } eval { # catch unwanted exception thrown # this function will die if the protocol is not included (http://) # if the protocol is partially included it won't die but will return null # this was failing during a redirect from dogpile - the redirect worked on # FireFox - but failed on IE???!!! - we suspected an encoding problem my $uri = URI->new($url); $hostname = $uri->host(); }; if ($@) { Goo::Logger::write("Write tried to resolve this URL: $url ." . $@, "/tmp/uri.bug.log"); die("URI bug: $url " . $@); } return $hostname; } ############################################################################### # # strip_hreftags - strip all href tags # ############################################################################### sub strip_hreftags { my ($string) = @_; $string =~ s!] )* >.*?!!gsix; return $string; } ############################################################################### # # uppercase_first_letters - turn the first letters of each word into uppercase # ############################################################################### sub uppercase_first_letters { my ($string) = @_; # substitute at word boundaries # store the word in $1 # set the whole thing to lowercase and the first letter to uppercase $string =~ s/\b([\w\']+)/\L\u$1/g; return $string; } ############################################################################### # # escape_url - escape a url string # ############################################################################### sub escape_url { my ($string) = @_; # substitute any spaces for $string =~ s/ /\+/g; return $string; } ############################################################################### # # strip_funky_html - strip any html that is too funky for a normal tag strip # ############################################################################### sub strip_funky_html { my ($string) = @_; $string =~ s!]*>.*?! !sig; # strip Javascript $string =~ s!]*>.*?! !sig; # strip stylesheets $string =~ s|| |sig; # strip HTML comments $string = strip_html($string); # strip all other tags # strip any html entities like   - this could be better $string =~ s/&[a-zA-Z]{1,4};/ /sig; # strip any numeric entities $string =~ s/&[0-9]{1,4};/ /g; # strip any numeric entities $string =~ s/&\#[0-9]{1,4};/ /g; # strip any parentheses () $string =~ s/\(\W*\)/ /g; # strip any literal carriage returns $string =~ s/\\[rn]/ /g; $string = compress_whitespace($string); return $string; } ############################################################################### # # strip_html - strip the html from a string # ############################################################################### sub strip_html { my ($string) = @_; # strip HTML entities $string =~ s/\<\;//ig; # strip tags $string =~ s/<[^>]*>//g; return $string; } ############################################################################### # # trim_whitespace - strip whitespace from the front and back of a string # ############################################################################### sub trim_whitespace { my ($string) = @_; $string =~ s/^\s+//g; # strip leading whitespace $string =~ s/\s+$//g; # string trailing whitespace return $string; } ############################################################################### # # compress_whitespace - compress excess whitespace from many to 1 space # ############################################################################### sub compress_whitespace { my ($string) = @_; $string =~ s/\s+/ /g; # compress whitespace return $string; } ############################################################################### # # right_pad - pad a string on the righthand side up to a maximum # ############################################################################### sub right_pad { my ($string, $padding, $maxsize) = @_; # truncate the string if longer than maxsize $string = substr($string, 0, $maxsize); # add some padding on the right return $string . $padding x ($maxsize - length($string)); } ############################################################################### # # strip_last_word - strip the last word off the end of a string # ############################################################################### sub strip_last_word { my ($string) = @_; # go to the end of the string and snip off the first bit of # non-whitespace $string =~ s/\S+$//; return $string; } ############################################################################### # # left_pad - pad a string on the lefthand side up to a maximum # ############################################################################### sub left_pad { my ($string, $padding, $maxsize) = @_; # truncate the string if longer than maxsize $string = substr($string, 0, $maxsize); # add some padding on the left return ($maxsize - length($string)) x $padding . $string; } ############################################################################### # # truncate_string - reduce the size of the string and remove the last word # ############################################################################### sub truncate_string { my ($string, $size, $dots) = @_; # print $string; if (length($string) > $size) { #print "--------> in here <----- $size"; $string = substr($string, 0, $size); # print $string; #print $string; # lop off the last word - removes partial words $string = strip_last_word($string); # add dots if we want them if ($dots) { $string .= $dots; } } return $string; } ############################################################################### # # escape_javascript - escape double quotes etc. # ############################################################################### sub escape_javascript { my ($string) = @_; # escape any double quotes, so the Javascript parses OK $string =~ s/"/\\"/g; # strip line feeds $string =~ s/[\n\r]+//g; # strip excess whitespace around = signs $string =~ s/\s+=\s+/=/g; # strip excess whitespace $string =~ s/\s+/ /g; return $string; } ############################################################################### # # get_matching_line_number - return the linenumber that matches the regex # ############################################################################### sub get_matching_line_number { my ($regex, $string) = @_; my @lines = split(/\n/, $string); my $linecount = 0; foreach my $line (@lines) { $linecount++; if ($line =~ /$regex/) { # add 5 to get into the body of the method return $linecount; } } return $linecount; } 1; __END__ =head1 NAME Goo::TextUtilities - Miscellaneous utilities for handling text =head1 SYNOPSIS use Goo::TextUtilities; =head1 DESCRIPTION =head1 METHODS =over =item get_hostname return a hostname from the url =item strip_hreftags strip all href tags in a string =item uppercase_first_letters turn the first letters of each word into uppercase =item escape_url escape a url string =item strip_funky_html strip any HTML that is too funky for a normal tag strip =item strip_html strip the HTML from a string =item trim_whitespace strip whitespace from the front and back of a string =item compress_whitespace compress excess whitespace from many spaces to one space =item right_pad pad a string on the righthand side up to a maximum number of characters =item strip_last_word strip the last word off the end of a string =item left_pad pad a string on the lefthand side up to a maximum =item truncate_string reduce the size of the string and remove the last word =item escape_javascript escape double quotes etc. =item get_matching_line_number return the linenumber that matches the regex =back =head1 AUTHOR Nigel Hamilton =head1 SEE ALSO