package HTML::FormatData; =pod =head1 NAME HTML::FormatData - formats strings and dates for web display/storage =head1 SYNOPSIS use HTML::FormatData; my $f = HTML::FormatData->new(); my $string = "bolded"; my $formatted = $f->format_text( $string, strip_html=>1 ); # $string eq 'bolded' my $dt = $f->parse_date( $dt_string, '%Y%m%d%H%M%S' ); my $yrmoday = $f->format_date( $dt, '%Y%m%d' ); $yrmoday = $f->reformat_date( $dt_string, '%Y%m%d%H%M%S', '%Y%m%d' ); # shortcut =head1 DESCRIPTION HTML::FormatData contains utility functions to format strings and dates. These utilities are useful for formatting data to be displayed on webpages, or for cleaning and date data during server-side validation before storage in a database or file. While doing web development work in the past, I noticed that I was having to do the same operations time and again: strip HTML from form submissions, truncate strings for display as table data, URI-encode strings for use in links, translate Unix timestamps into mm/dd/yyyy format, etc. Rather than try to keep straight the different modules and functions used, I decided to write a wrapper with a single, consistent interface. =head1 METHODS =cut use 5.006; use strict; use warnings; use Carp qw( croak ); use DateTime; use DateTime::Format::Strptime; use HTML::Entities; use HTML::Parser; use URI::Escape; our $VERSION = '0.10'; =pod =head2 new() This method creates a new HTML::FormatData object. Returns the blessed object. =cut sub new { my $class = shift; my $config = shift; bless {}, $class; } =pod =head2 format_text( $string, %args )> Wrapper function for the text formatting routines below. Formats a string according to parameters passed in. While the functions this routine calls can be called directly, it will usually be best to always go thru this function. Returns the formatted string. =cut sub format_text { my $self = shift; my $string = shift; croak( "Odd number of parameters passed to format_text." ) if @_ % 2; my %args = @_; return unless defined $string; return '' if $string eq ''; my @jobs = qw( decode_xml decode_html decode_uri strip_html strip_whitespace clean_high_ascii clean_encoded_html clean_encoded_text clean_whitespace clean_whitespace_keep_full_breaks clean_whitespace_keep_all_breaks force_lc force_uc truncate truncate_with_ellipses encode_xml encode_html encode_uri ); foreach my $job ( @jobs ) { next unless exists $args{$job}; if ( $job =~ /^truncate/ ) { $string = $self->$job( $string, $args{$job} ); } else { $string = $self->$job( $string ); } } return $string; } =pod =head2 decode_xml( $string ) A copy of XML::Comma::Util::XML_basic_unescape. Returns an XML-unescaped string. =cut sub decode_xml { my $self = shift; my $string = shift; $string =~ s/\&/&/g ; $string =~ s/\<//g ; return $string; } =pod =head2 decode_html( $string ) Returns an HTML-unescaped string. =cut sub decode_html { my $self = shift; my $string = shift; return HTML::Entities::decode( $string ); } =pod =head2 decode_uri( $string ) Returns an URI-unescaped string. =cut sub decode_uri { my $self = shift; my $string = shift; return URI::Escape::uri_unescape( $string ); } =pod =head2 strip_html( $string ) Strips all HTML tags from string. Returns string. =cut sub strip_html { my $self = shift; my $string = shift; our $output; $output = ''; sub default_handler { $output .= shift; } my $p = HTML::Parser->new( api_version => 3 ); $p->handler( default => \&default_handler, "text" ); $p->handler( start => "" ); $p->handler( end => "" ); $p->handler( comment => '' ); $p->handler( declaration => '' ); $p->handler( process => '' ); $p->ignore_elements( qw( script style ) ); $p->parse( "$string " ); return $output; } =pod =head2 strip_whitespace( $string ) Strips all whitespace ( \s ) characters from string. Returns string. =cut sub strip_whitespace { my $self = shift; $_ = shift; s/\s+//g; return $_; } =pod =head2 clean_high_ascii( $string ) Converts 8-bit ascii characters to their 7-bit counterparts. Tested with MS-Word documents; might not work right with high-ascii text from other sources. Returns string. =cut sub clean_high_ascii { my $self = shift; $_ = shift; my ( $high, $low ); ### single quotes $high = chr(145); $high = qr{$high}; $low = qr{'}; s/$high/$low/g; $high = chr(146); $high = qr{$high}; s/$high/$low/g; ### double quotes $high = chr(147); $high = qr{$high}; $low = qr{"}; s/$high/$low/g; $high = chr(148); $high = qr{$high}; s/$high/$low/g; ### endash $high = chr(150); $high = qr{$high}; $low = qr{-}; s/$high/$low/g; ### emdash $high = chr(151); $high = qr{$high}; $low = qr{--}; s/$high/$low/g; ### ellipsis $high = chr(133); $high = qr{$high}; $low = qr{...}; s/$high/$low/g; ### unknown $high = chr(194); $high = qr{$high}; s/$high//g; return $_; } =pod =head2 clean_html_encoded_text( $string ) Properly encodes some entities skipped by HTML::Entities::encode. Returns the modified string. =cut sub clean_html_encoded_text { my $self = shift; $_ = shift; ### properly encode m-dashes s/\—/\—/g; s/--/\—/g; ### properly encode ellipses s/\.\.\./\…/g; ### encode apostrophes #s/'/’/g; return $_; } =pod =head2 decode_select_entities( $string ) Takes HTML::Entities::encoded HTML and selectively unencodes certain entities for display on webpage. Returns modified string. =cut sub decode_select_entities { my $self = shift; $_ = shift; ### restore angle brackets s/\<//g; ### restore quotes inside angle brackets 1 while s/(<[^>]*)(\")/$1\"/gs; return $_; } =pod =head2 clean_encoded_html( $string ) Formats HTML-encoded HTML for display on webpage. Returns modified string. =cut sub clean_encoded_html { my $self = shift; my $string = shift; $string = $self->decode_select_entities( $string ); $string = $self->clean_html_encoded_text( $string ); return $string; } =pod =head2 clean_encoded_text( $string ) Formats HTML-encoded text for display on webpage. Returns modified string. =cut sub clean_encoded_text { my $self = shift; my $string = shift; $string = $self->clean_html_encoded_text( $string ); return $string; } =pod =head2 clean_whitespace( $string [keep_full_breaks => 1 | keep_all_breaks => 1] ) Cleans up whitespace in HTML and plain text. If passed an argument for handling line breaks, it will either keep full breaks (\n\n) or all breaks (any \n). Otherwise, all line breaks will be converted to spaces. Returns the modified string. =cut sub clean_whitespace { my $self = shift; $_ = shift; croak( "Odd number of parameters passed to format_text." ) if @_ % 2; my %args = @_; s/\r\n/\n/g; s/\r/\n/g; 1 while s/\n\n\n/\n\n/g; s/^[ \t\f]+//g; s/[ \t\f]+$//g; if ( $args{keep_all_breaks} ) { 1 while s/ / /g; } elsif ( $args{keep_full_breaks} ) { s/\n\n/\$\$\$/g; s/\n/ /g; 1 while s/ / /g; s/\$\$\$/\n\n/g; } else { s/\n/ /g; 1 while s/ / /g; } return $_; } =pod =head2 clean_whitespace_keep_full_breaks( $string ) Cleans up whitespace in HTML and plain text while preserving all full breaks (\n\n). Returns the modified string. =cut sub clean_whitespace_keep_full_breaks { my $self = shift; my $string = shift; return $self->clean_whitespace( $string, keep_full_breaks => 1 ); } =pod =head2 clean_whitespace_keep_all_breaks( $string ) Cleans up whitespace in HTML and plain text while preserving all line breaks (\n). Returns the modified string. =cut sub clean_whitespace_keep_all_breaks { my $self = shift; my $string = shift; return $self->clean_whitespace( $string, keep_all_breaks => 1 ); } =pod =head2 force_lc( $string ) Returns lc( $string ). =cut sub force_lc { my $self = shift; my $string = shift; return lc $string; } =pod =head2 force_uc( $string ) Returns uc( $string ). =cut sub force_uc { my $self = shift; my $string = shift; return uc $string; } =pod =head2 truncate( $string, $count ) Returns the first $count characters of string. =cut sub truncate { my $self = shift; my $string = shift; my $count = shift; if ( length( $string ) > $count ) { $string = substr( $string, 0, $count ); } return $string; } =pod =head2 truncate_with_ellipses( $string, $count ) Returns the first $count - 3 characters of string followed by '...'. =cut sub truncate_with_ellipses { my $self = shift; my $string = shift; my $count = shift; if ( $count > 3 ) { if ( length( $string ) > $count ) { $string = substr( $string, 0, ( $count - 3 ) ) . '...'; } } return $string; } =pod =head2 encode_xml( $string ) A copy of XML::Comma::Util::XML_basic_escape. Returns an XML-escaped string. =cut sub encode_xml { my $self = shift; my $string = shift; # escape & $string =~ s/\&/&/g; # escape < > $string =~ s//\>/g ; return $string; } =pod =head2 encode_html( $string ) Returns an HTML-escaped string. =cut sub encode_html { my $self = shift; my $string = shift; return HTML::Entities::encode( $string ); } =pod =head2 encode_uri( $string ) Returns an URI-escaped string. =cut sub encode_uri { my $self = shift; my $string = shift; return URI::Escape::uri_escape( $string ); } =pod =head2 reformat_date( $string, $oldformat, $newformat ) Takes a date string in $oldformat and returns a new string in $new_format. =cut sub reformat_date { my $self = shift; my $string = shift; my $oldformat = shift; my $newformat = shift; my $dt = $self->parse_date( $string, $oldformat ); return $self->format_date( $dt, $newformat ); } =pod =head2 parse_date( $string [, $format] ) Takes a $string representing a date and time, and tries to produce a valid DateTime object. Returns the object upon success, otherwise undef. Setting $string to 'now' creates a DateTime object of the current date and time. Setting $string to 'today' creates a DateTime object of today's date and time set to midnight. Otherwise, you must pass a $format to parse the string correctly. $format can be set to one of the following "shortcuts": 'date8', 'date14', or 'rfc822'. =cut sub parse_date { my $self = shift; my $string = shift; my $format = shift; return unless $string; if ( $string eq 'now' ) { return DateTime->now( time_zone => 'local' ); } if ( $string eq 'today' ) { return DateTime->today( time_zone => 'local' ); } return unless $format; $format = '%Y%m%d' if $format eq 'date8'; $format = '%Y%m%d%H%M%S' if $format eq 'date14'; $format = '%a, %d %b %Y %H:%M:%S %z' if $format eq 'rfc822'; if ( $format eq '%s' ) { return DateTime->from_epoch( epoch => $string, time_zone => 'local' ); } else { my $parser = DateTime::Format::Strptime->new( pattern => $format, on_error => 'undef', time_zone => 'local' ); return $parser->parse_datetime( $string ); } } =pod =head2 format_date( $dt, $format ) Takes a DateTime object ($dt) and a $format, and returns the formatted string. $format is a DateTime 'strftime' format string. $format can be set to one of the following "shortcuts": 'date8', 'date14', and 'rfc822'. =cut sub format_date { my $self = shift; my $dt = shift; my $format = shift; return unless ref $dt eq 'DateTime'; $format = '%Y%m%d' if $format eq 'date8'; $format = '%Y%m%d%H%M%S' if $format eq 'date14'; $format = '%a, %d %b %Y %H:%M:%S %z' if $format eq 'rfc822'; return $dt->strftime( $format ); } =pod =head1 AUTHOR Eric Folley, Eeric@folley.netE =head1 COPYRIGHT AND LICENSE Copyright 2004-2005 by Eric Folley This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut 1;