package Regexp::IgnoreTextCharacteristicsHTML;

use strict;
use warnings;

use Regexp::Ignore;
our @ISA = ("Regexp::Ignore");    # inherit from Regexp::Ignore class

########################
# new
########################
# Constructor. All arguments are passed unchanged to the parent
# constructor; the object is then configured to ignore HTML remarks,
# MSWord remarks and the default set of text-characteristics tags, and
# the two regular expressions (RE1, RE2) are compiled.
sub new {
    my $proto = shift;
    my $class = ref($proto) || $proto;
    my $self  = $class->SUPER::new(@_);

    $self->{IGNORE_HTML_REMARKS} = 1;    # by default it ignores html remarks
    $self->{IGNORE_WORD_REMARKS} = 1;    # by default it ignores word remarks

    # the tags to be ignored (upper-case name => true while ignored)
    $self->{IGNORE_TAGS} = {
        map { $_ => 1 }
            qw(B BASEFONT BIG BLINK CITE CODE EM FONT I KBD PLAINTEXT
               S SMALL STRIKE STRONG SUB SUP TT U VAR A SPAN WBR)
    };

    $self->build_regular_expressions();
    return $self;
}    # of new

############################
# build_regular_expressions
############################
# (Re)compiles the two patterns stored in the object from its current
# settings:
#   RE1 - finds every candidate token in the text,
#   RE2 - decides whether a candidate found by RE1 is unwanted.
# Returns the compiled RE2.
sub build_regular_expressions {
    my $self = shift;

    # The first regular expression will try to match, in this order:
    # - HTML remarks - the whole remark will be matched. This will
    #   clean out all the special tags of MSWord (that come inside
    #   remarks)
    # - MSWord remarks - starting with <![
    # - HTML tags
    my @candidates;
    push @candidates, '(<\!\-\-.+?\-\->)' if $self->{IGNORE_HTML_REMARKS};
    push @candidates, '(<\!\[[^\]]*?\]>)' if $self->{IGNORE_WORD_REMARKS};
    push @candidates, '(<\/?[^\>]*?>)';
    my $re1 = join('|', @candidates);
    $self->{RE1} = qr/$re1/is;

    # If the tag that we found is one of the following, it is an
    # unwanted token.
    my @unwanted;
    if ($self->{IGNORE_HTML_REMARKS}) {
        push @unwanted, '(<\!\-\-.+?\-\->)';
    }
    if ($self->{IGNORE_WORD_REMARKS}) {
        push @unwanted, '(<\!\[[^\]]*?\]>)', '<\/?\s*[OVWXP]\:[^>]*?>';
    }

    # Sorted only so that the compiled pattern does not depend on hash
    # ordering; the order of the alternatives does not change what matches.
    my @tags = $self->tags_to_ignore();
    foreach my $tag (sort @tags) {
        push @unwanted, '<\/?\s*' . $tag . '(\s[^>]*?>|\s*>)';
    }

    # With nothing left to ignore the pattern must match nothing. An empty
    # pattern would match every token and mark all tags as unwanted.
    my $re2 = @unwanted ? join('|', @unwanted) : '(?!)';
    $self->{RE2} = qr/$re2/is;

    return $self->{RE2};
}    # of build_regular_expressions

#####################
# do_not_ignore
#####################
# Stops ignoring each of the given tag names (case-insensitive). Names
# that are not known to the object are silently skipped. The regular
# expressions are rebuilt afterwards.
sub do_not_ignore {
    my ($self, @tags) = @_;

    foreach my $tag (@tags) {
        my $name = uc($tag);
        if (exists($self->{IGNORE_TAGS}{$name})) {
            $self->{IGNORE_TAGS}{$name} = 0;
        }
    }

    return $self->build_regular_expressions();
}    # of do_not_ignore

#####################
# tags_to_ignore
#####################
# Marks each given tag name (case-insensitive) as ignored - known or
# new - and rebuilds the regular expressions if any name was given.
# In void context nothing is returned; otherwise the currently ignored
# tag names are returned (in scalar context: how many there are).
sub tags_to_ignore {
    my ($self, @new_tags) = @_;

    if (@new_tags) {
        foreach my $tag (@new_tags) {
            $self->{IGNORE_TAGS}{ uc($tag) } = 1;
        }
        $self->build_regular_expressions();
    }

    return unless defined(wantarray);    # void context, do nothing

    my $state          = $self->{IGNORE_TAGS};
    my @tags_to_ignore = grep { $state->{$_} } keys(%{$state});
    return @tags_to_ignore;
}    # of tags_to_ignore

######################
# ignore_html_remarks
######################
# Accessor for IGNORE_HTML_REMARKS. When called with an argument, stores
# it and rebuilds the regular expressions. Returns the current value.
sub ignore_html_remarks {
    my $self = shift;

    if (@_) {
        $self->{IGNORE_HTML_REMARKS} = shift;
        $self->build_regular_expressions();
    }

    return $self->{IGNORE_HTML_REMARKS};
}    # of ignore_html_remarks

######################
# ignore_word_remarks
######################
# Accessor for IGNORE_WORD_REMARKS. When called with an argument, stores
# it and rebuilds the regular expressions. Returns the current value.
sub ignore_word_remarks {
    my $self = shift;

    if (@_) {
        $self->{IGNORE_WORD_REMARKS} = shift;
        $self->build_regular_expressions();
    }

    return $self->{IGNORE_WORD_REMARKS};
}    # of ignore_word_remarks

##########################################################################
# Our get_tokens will treat any html tag that changes the style of the
# text as unwanted. It will also treat HTML remarks as unwanted. This
# will let us parse HTML documents that were saved by MSWord - where
# sometimes variable_one becomes something like:
# variable_<b>one</b>.
########################
# get_tokens
########################
# Splits the object's text into tokens and classifies each of them.
#
# Returns a list of two array references, (TOKENS, FLAGS), of equal
# length. FLAGS->[i] is 1 when TOKENS->[i] is clean (wanted) text and 0
# when it is an unwanted token.
#
# Everything between two matches of RE1 is clean text. A match of RE1 is
# unwanted only if it also matches RE2; otherwise it is kept as clean.
# An undefined or empty text yields two empty lists.
sub get_tokens {
    my $self = shift;

    my @tokens;
    my @flags;

    # we should create tokens from the TEXT.
    # NOTE(review): text() is not defined in this file; presumably it is
    # inherited from Regexp::Ignore - confirm.
    my $text = $self->text();
    return (\@tokens, \@flags) unless defined($text);

    # the regular expressions
    my $re1 = $self->{RE1};
    my $re2 = $self->{RE2};

    # Walk over the text once with //g and slice by offsets, instead of
    # re-copying the remaining text after every match.
    my $consumed = 0;    # offset just after the previous match
    while ($text =~ /$re1/g) {
        # Save the offsets now: the RE2 test below overwrites @- and @+.
        my ($from, $to) = ($-[0], $+[0]);

        # if there is text before the match, take it as clean
        if ($from > $consumed) {
            push(@tokens, substr($text, $consumed, $from - $consumed));
            push(@flags, 1);
        }

        # the match itself: it might be unwanted or wanted
        my $candidate = substr($text, $from, $to - $from);
        push(@tokens, $candidate);
        push(@flags, ($candidate =~ /$re2/) ? 0 : 1);

        $consumed = $to;
    }

    # Whatever is left after the last match is clean text as well. The
    # test is on length, so that a remaining "0" is not thrown away.
    if ($consumed < length($text)) {
        push(@tokens, substr($text, $consumed));
        push(@flags, 1);
    }

    # return the two lists
    return (\@tokens, \@flags);
}    # of get_tokens

1;    # make perl happy

__END__

=head1 NAME

Regexp::IgnoreTextCharacteristicsHTML - Let us ignore the HTML tags when parsing HTML text

=head1 SYNOPSIS

  use Regexp::IgnoreTextCharacteristicsHTML;

  my $rei = Regexp::IgnoreTextCharacteristicsHTML->new($text, "");

  # split the wanted text from the unwanted text
  $rei->split();

  # use substitution function
  $rei->s('(var)_(\d+)', '$2$1', 'gi');
  $rei->s('(\d+):(\d+)', '$2:$1');

  # merge back to get the resulted text
  my $changed_text = $rei->merge();

=head1 DESCRIPTION

Inherits from B<Regexp::Ignore> and implements the B<get_tokens>
method. The tokens that are returned by B<get_tokens> as unwanted are
text characteristics HTML tags. To be specific, the tags:
E<lt>BE<gt>, E<lt>BASEFONTE<gt>, E<lt>BIGE<gt>, E<lt>BLINKE<gt>,
E<lt>CITEE<gt>, E<lt>CODEE<gt>, E<lt>EME<gt>, E<lt>FONTE<gt>,
E<lt>IE<gt>, E<lt>KBDE<gt>, E<lt>PLAINTEXTE<gt>, E<lt>SE<gt>,
E<lt>SMALLE<gt>, E<lt>STRIKEE<gt>, E<lt>STRONGE<gt>, E<lt>SUBE<gt>,
E<lt>SUPE<gt>, E<lt>TTE<gt>, E<lt>UE<gt>, E<lt>VARE<gt>, E<lt>AE<gt>,
E<lt>SPANE<gt>, and E<lt>WBRE<gt>.

It will also take as unwanted tokens any HTML remarks and any remarks
that MSWord creates when saving a document as HTML. However, this
behaviour can be changed using the class members IGNORE_HTML_REMARKS and
IGNORE_WORD_REMARKS.

=head1 ACCESS METHODS

=over 4

=item ignore_html_remarks ( BOOLEAN )

If true (which is also the default), the B<get_tokens> method will take
the HTML remarks as unwanted tokens. So, any E<lt>!-- ... --E<gt> will be
ignored. Should be called before B<split> is called.

=item ignore_word_remarks ( BOOLEAN )

If true (which is also the default), the B<get_tokens> method will take
the WORD remarks as unwanted tokens. So, any E<lt>![ ... ]E<gt> will be
ignored. Should be called before B<split> is called.

=item do_not_ignore ( TAGS )

TAGS is a list of strings, each of which is the name of a tag. For
example:

  ("B", "FONT")

The tags that are sent to this method will not be ignored by the object.

=item tags_to_ignore ( TAGS )

TAGS is a list of strings, each of which is the name of a tag. See
B<do_not_ignore> above for an example. The tags that are sent to this
method will be ignored by the object. You can send already ignored tags,
tags that were canceled by a call to B<do_not_ignore>, or totally new
tags. All of them will be ignored.

In a list context, it will return a list of all the tags that will be
ignored.

=back

=head1 AUTHOR

Rani Pinchuk, E<lt>rani@cpan.orgE<gt>

=head1 COPYRIGHT

Copyright (c) 2002 Ockham Technology N.V. & Rani Pinchuk.
All rights reserved.
This package is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=head1 SEE ALSO

L<Regexp::Ignore>.

=for comment
The original list held four links whose targets were lost; only the
parent class could be recovered from the code.

=cut