<= 0 : Preformat entire document
1 : one line triggers
>= 2 : two lines trigger
(default: 2)
=item endpreformat_trigger_lines
endpreformat_trigger_lines=>I
How many lines of unpreformatted-looking text are needed to switch from
<= 0 : Never preformat within document
1 : one line triggers
>= 2 : two lines trigger
(default: 2)
NOTE for preformat_trigger_lines and endpreformat_trigger_lines:
A zero takes precedence. If one is zero, the other is ignored.
If both are zero, entire document is preformatted.
=item preformat_start_marker
preformat_start_marker=>I
What flags the start of a preformatted section if --use_preformat_marker
is true.
(default: "^(:?(:?<)|<)PRE(:?(:?>)|>)\$")
=item preformat_end_marker
preformat_end_marker=>I
What flags the end of a preformatted section if --use_preformat_marker
is true.
(default: "^(:?(:?<)|<)/PRE(:?(:?>)|>)\$")
=item preformat_whitespace_min
preformat_whitespace_min=>I
Minimum number of consecutive whitespace characters to trigger
normal preformatting.
NOTE: Tabs are expanded to spaces before this check is made.
That means if B is 8 and this is 5, then one tab may be
expanded to 8 spaces, which is enough to trigger preformatting.
(default: 5)
=item prepend_file
prepend_file=>I
If you want something prepended to the processed body text, put the
filename here. The prepended text will not be processed at all, so make
sure it's plain text or correct HTML.
(default: nothing)
=item preserve_indent
preserve_indent=>1
Preserve the first-line indentation of paragraphs marked with indents
by replacing the spaces of the first line with non-breaking spaces.
(default: false)
=item short_line_length
short_line_length=>I
Lines this short (or shorter) must be intentionally broken and are kept
that short.
(default: 40)
=item style_url
style_url=>I
This gives the URL of a stylesheet; a LINK tag will be added to the
output.
=item tab_width
tab_width=>I
How many spaces equal a tab?
(default: 8)
=item table_type
table_type=>{ ALIGN=>0, PGSQL=>0, BORDER=>1, DELIM=>0 }
This determines which types of tables will be recognised when "make_tables"
is true. The possible types are ALIGN, PGSQL, BORDER and DELIM.
(default: all types are true)
=item title
title=>I
You can specify a title. Otherwise it will use a blank one.
(default: nothing)
=item titlefirst
titlefirst=>1
Use the first non-blank line as the title. (See also "title")
=item underline_length_tolerance
underline_length_tolerance=>I
How much longer or shorter can underlines be and still be underlines?
(default: 1)
=item underline_offset_tolerance
underline_offset_tolerance=>I
How far offset can underlines be and still be underlines?
(default: 1)
=item unhyphenation
unhyphenation=>0
Enables unhyphenation of text.
(default: true)
=item use_mosaic_header
use_mosaic_header=>1
Use this option if you want to force the heading styles to match what Mosaic
outputs. (Underlined with "***"s is H1,
with "==="s is H2, with "+++" is H3, with "---" is H4, with "~~~" is H5
and with "..." is H6)
This was the behavior of txt2html up to version 1.10.
(default: false)
=item use_preformat_marker
use_preformat_marker=>1
Turn on preformatting when encountering "" on a line by itself, and turn
it off when there's a line containing only "
".
When such preformatted text is detected, the PRE tag will be given the
class 'quote_explicit'.
(default: off)
=item xhtml
xhtml=>1
Try to make the output conform to the XHTML standard, including
closing all open tags and marking empty tags correctly. This
turns on --lower_case_tags and overrides the --doctype option.
Note that if you add a header or a footer file, it is up to you
to make it conform; the header/footer isn't touched by this.
Likewise, if you make link-dictionary entries that break XHTML,
then this won't fix them, except to the degree of putting all tags
into lower-case.
(default: true)
=back
=head1 DEBUGGING
There are global variables for setting types and levels
of debugging. These should only be used by developers.
=over
=item $HTML::TextToHTML::Debug
$HTML::TextToHTML::Debug = 1;
Enable copious debugging output.
(default: false)
=item $HTML::TextToHTML::DictDebug
$HTML::TextToHTML::DictDebug = I;
Debug mode for link dictionaries. Bitwise-Or what you want to see:
1: The parsing of the dictionary
2: The code that will make the links
4: When each rule matches something
8: When each tag is created
(default: 0)
=back
=cut
our $Debug = 0;
our $DictDebug = 0;
=head1 METHODS
=cut
#------------------------------------------------------------------------
use YAML::Syck;
our $PROG = 'HTML::TextToHTML';
#------------------------------------------------------------------------
########################################
# Definitions (Don't change these)
#
# These are just constants I use for making bit vectors to keep track
# of what modes I'm in and what actions I've taken on the current and
# previous lines.
our $NONE = 0;
our $LIST = 1;
our $HRULE = 2;
our $PAR = 4;
our $PRE = 8;
our $END = 16;
our $BREAK = 32;
our $HEADER = 64;
our $MAILHEADER = 128;
our $MAILQUOTE = 256;
our $CAPS = 512;
our $LINK = 1024;
our $PRE_EXPLICIT = 2048;
our $TABLE = 4096;
our $IND_BREAK = 8192;
our $LIST_START = 16384;
our $LIST_ITEM = 32768;
# Constants for Link-processing
# bit-vectors for what to do with a particular link-dictionary entry
our $LINK_NOCASE = 1;
our $LINK_EVAL = 2;
our $LINK_HTML = 4;
our $LINK_ONCE = 8;
our $LINK_SECT_ONCE = 16;
# Constants for Ordered Lists and Unordered Lists.
# And Definition Lists.
# I use this in the list stack to keep track of what's what.
our $OL = 1;
our $UL = 2;
our $DL = 3;
# Constants for table types
our $TAB_ALIGN = 1;
our $TAB_PGSQL = 2;
our $TAB_BORDER = 3;
our $TAB_DELIM = 4;
# Constants for tags
use constant {
TAG_START => 1,
TAG_END => 2,
TAG_EMPTY => 3,
};
# Character entity names
# characters to replace with entities
our %char_entities = (
"\241", "¡", "\242", "¢", "\243", "£",
"\244", "¤", "\245", "¥", "\246", "¦",
"\247", "§", "\250", "¨", "\251", "©",
"\252", "ª", "\253", "«", "\254", "¬",
"\255", "", "\256", "®", "\257", "&hibar;",
"\260", "°", "\261", "±", "\262", "²",
"\263", "³", "\264", "´", "\265", "µ",
"\266", "¶", "\270", "¸", "\271", "¹",
"\272", "º", "\273", "»", "\274", "¼",
"\275", "½", "\276", "¾", "\277", "¿",
"\300", "À", "\301", "Á", "\302", "Â",
"\303", "Ã", "\304", "Ä", "\305", "Å",
"\306", "Æ", "\307", "Ç", "\310", "È",
"\311", "É", "\312", "Ê", "\313", "Ë",
"\314", "Ì", "\315", "Í", "\316", "Î",
"\317", "Ï", "\320", "Ð", "\321", "Ñ",
"\322", "Ò", "\323", "Ó", "\324", "Ô",
"\325", "Õ", "\326", "Ö", "\327", "×",
"\330", "Ø", "\331", "Ù", "\332", "Ú",
"\333", "Û", "\334", "Ü", "\335", "Ý",
"\336", "Þ", "\337", "ß", "\340", "à",
"\341", "á", "\342", "â", "\343", "ã",
"\344", "ä", "\345", "å", "\346", "æ",
"\347", "ç", "\350", "è", "\351", "é",
"\352", "ê", "\353", "ë", "\354", "ì",
"\355", "í", "\356", "î", "\357", "ï",
"\360", "ð", "\361", "ñ", "\362", "ò",
"\363", "ó", "\364", "ô", "\365", "õ",
"\366", "ö", "\367", "÷", "\370", "ø",
"\371", "ù", "\372", "ú", "\373", "û",
"\374", "ü", "\375", "ý", "\376", "þ",
"\377", "ÿ", "\267", "·",
);
# alignments for tables
our @alignments = ('', '', ' ALIGN="RIGHT"', ' ALIGN="CENTER"');
our @lc_alignments = ('', '', ' align="right"', ' align="center"');
our @xhtml_alignments =
('', '', ' style="text-align: right;"', ' style="text-align: center;"');
#---------------------------------------------------------------#
# Object interface
#---------------------------------------------------------------#
=head2 new
$conv = new HTML::TextToHTML()
$conv = new HTML::TextToHTML(titlefirst=>1,
...
);
Create a new object with new. If arguments are given, these arguments
will be used in invocations of other methods.
See L for the possible values of the arguments.
=cut
sub new
{
my $invocant = shift;
my $self = {};
my $class = ref($invocant) || $invocant; # Object or class name
init_our_data($self);
# bless self
bless($self, $class);
$self->args(@_);
return $self;
} # new
=head2 args
$conv->args(short_line_length=>60,
titlefirst=>1,
....
);
Updates the current arguments/options of the HTML::TextToHTML object.
Takes hash of arguments, which will be used in invocations of other
methods.
See L for the possible values of the arguments.
=cut
sub args
{
my $self = shift;
my %args = @_;
if (%args)
{
if ($Debug)
{
print STDERR "========args(hash)========\n";
print STDERR Dump(%args);
}
my $arg;
my $val;
while (($arg, $val) = each %args)
{
if (defined $val)
{
if ($arg =~ /^-/)
{
$arg =~ s/^-//; # get rid of first dash
$arg =~ s/^-//; # get rid of possible second dash
}
if ($Debug)
{
print STDERR "--", $arg;
}
$self->{$arg} = $val;
if ($Debug)
{
print STDERR " ", $val, "\n";
}
}
}
}
$self->deal_with_options();
if ($Debug)
{
print STDERR Dump($self);
}
return 1;
} # args
=head2 process_chunk
$newstring = $conv->process_chunk($mystring);
Convert a string to a HTML fragment. This assumes that this string is
at the least, a single paragraph, but it can contain more than that.
This returns the processed string. If you want to pass arguments to
alter the behaviour of this conversion, you need to do that earlier,
either when you create the object, or with the L method.
$newstring = $conv->process_chunk($mystring,
close_tags=>0);
If there are open tags (such as lists) in the input string,
process_chunk will automatically close them, unless you specify not
to, with the close_tags option.
$newstring = $conv->process_chunk($mystring,
is_fragment=>1);
If you want this string to be treated as a fragment, and not assumed to
be a paragraph, set is_fragment to true. If there is more than one
paragraph in the string (ie it contains blank lines) then this option
will be ignored.
=cut
sub process_chunk ($$;%)
{
my $self = shift;
my $chunk = shift;
my %args = (
close_tags => 1,
is_fragment => 0,
@_
);
my $ret_str = '';
my @paras = split(/\r?\n\r?\n/, $chunk);
my $ind = 0;
if (@paras == 1) # just one paragraph
{
$ret_str .= $self->process_para(
$chunk,
close_tags => $args{close_tags},
is_fragment => $args{is_fragment}
);
}
else
{
my $ind = 0;
foreach my $para (@paras)
{
# if the paragraph doesn't end with a newline, add one
$para .= "\n" if ($para !~ /\n$/);
if ($ind == @paras - 1) # last one
{
$ret_str .= $self->process_para(
$para,
close_tags => $args{close_tags},
is_fragment => 0
);
}
else
{
$ret_str .= $self->process_para(
$para,
close_tags => 0,
is_fragment => 0
);
}
$ind++;
}
}
$ret_str;
} # process_chunk
=head2 process_para
$newstring = $conv->process_para($mystring);
Convert a string to a HTML fragment. This assumes that this string is
at the most a single paragraph, with no blank lines in it. If you don't
know whether your string will contain blank lines or not, use the
L method instead.
This returns the processed string. If you want to pass arguments to
alter the behaviour of this conversion, you need to do that earlier,
either when you create the object, or with the L method.
$newstring = $conv->process_para($mystring,
close_tags=>0);
If there are open tags (such as lists) in the input string, process_para
will automatically close them, unless you specify not to, with the
close_tags option.
$newstring = $conv->process_para($mystring,
is_fragment=>1);
If you want this string to be treated as a fragment, and not assumed to be
a paragraph, set is_fragment to true.
=cut
sub process_para ($$;%)
{
my $self = shift;
my $para = shift;
my %args = (
close_tags => 1,
is_fragment => 0,
@_
);
# if this is an external call, do certain initializations
$self->do_init_call();
my $para_action = $NONE;
# tables and mailheaders don't carry over from one para to the next
if ($self->{__mode} & $TABLE)
{
$self->{__mode} ^= $TABLE;
}
if ($self->{__mode} & $MAILHEADER)
{
$self->{__mode} ^= $MAILHEADER;
}
# convert Microsoft character codes into sensible characters
if ($self->{demoronize})
{
demoronize_char($para);
}
# if we are not just linking, we are discerning structure
if (!$self->{link_only})
{
# Chop trailing whitespace and DOS CRs
$para =~ s/[ \011]*\015$//;
# Chop leading whitespace and DOS CRs
$para =~ s/^[ \011]*\015//;
$para =~ s/\r//g; # remove any stray carriage returns
my @done_lines = (); # lines which have been processed
# The PRE_EXPLICIT structure can carry over from one
# paragraph to the next, but it is ended with the
# explicit end-tag designated for it.
# Therefore we can shortcut for this by checking
# for the end of the PRE_EXPLICIT and chomping off
# the preformatted string part of this para before
# we have to split it into lines.
# Note that after this check, we could *still* be
# in PRE_EXPLICIT mode.
if ($self->{__mode} & $PRE_EXPLICIT)
{
my $pre_str =
$self->split_end_explicit_preformat(para_ref => \$para);
if ($pre_str)
{
push @done_lines, $pre_str;
}
}
if (defined $para && $para ne "")
{
#
# Now we split the paragraph into lines
#
my $para_len = length($para);
my @para_lines = split(/^/, $para);
my @para_line_len = ();
my @para_line_indent = ();
my @para_line_action = ();
my $i = 0;
foreach my $line (@para_lines)
{
# Change all tabs to spaces
while ($line =~ /\011/)
{
my $tw = $self->{tab_width};
$line =~ s/\011/" " x ($tw - (length($`) % $tw))/e;
}
push @para_line_len, length($line);
if ($line =~ /^\s*$/)
{
# if the line is blank, use the previous indent
# if there is one
push @para_line_indent,
($i == 0 ? 0 : $para_line_indent[$i - 1]);
}
else
{
# count the number of leading spaces
my ($ws) = $line =~ /^( *)[^ ]/;
push @para_line_indent, length($ws);
}
push @para_line_action, $NONE;
$i++;
}
# There are two more structures which carry over from one
# paragraph to the next: LIST, PRE
# There are also certain things which will immediately end
# multi-paragraph LIST and PRE, if found at the start
# of a paragraph:
# A list will be ended by
# TABLE, MAILHEADER, HEADER, custom-header
# A PRE will be ended by
# TABLE, MAILHEADER and non-pre text
my $is_table = 0;
my $table_type = 0;
my $is_mailheader = 0;
my $is_header = 0;
my $is_custom_header = 0;
if (@{$self->{custom_heading_regexp}})
{
$is_custom_header =
$self->is_custom_heading(line => $para_lines[0]);
}
if ( $self->{make_tables}
&& @para_lines > 1)
{
$table_type = $self->get_table_type(
rows_ref => \@para_lines,
para_len => $para_len
);
$is_table = ($table_type != 0);
}
if ( !$self->{explicit_headings}
&& @para_lines > 1
&& !$is_table)
{
$is_header = $self->is_heading(
line_ref => \$para_lines[0],
next_ref => \$para_lines[1]
);
}
# Note that it is concievable that someone has
# partially disabled mailmode by making a custom header
# which matches the start of mail.
# This is stupid, but allowable, so we check.
if ( $self->{mailmode}
&& !$is_table
&& !$is_custom_header)
{
$is_mailheader = $self->is_mailheader(rows_ref => \@para_lines);
}
# end the list if we can end it
if (
($self->{__mode} & $LIST)
&& ( $is_table
|| $is_mailheader
|| $is_header
|| $is_custom_header)
)
{
my $list_end = '';
my $action = 0;
$self->endlist(
num_lists => $self->{__listnum},
prev_ref => \$list_end,
line_action_ref => \$action
);
push @done_lines, $list_end;
$self->{__prev_para_action} |= $END;
}
# end the PRE if we can end it
if (
($self->{__mode} & $PRE)
&& !($self->{__mode} & $PRE_EXPLICIT)
&& ( $is_table
|| $is_mailheader
|| !$self->is_preformatted($para_lines[0]))
&& ($self->{preformat_trigger_lines} != 0)
)
{
my $pre_end = '';
my $tag = $self->close_tag('pre');
$pre_end = "${tag}\n";
$self->{__mode} ^= ($PRE & $self->{__mode});
push @done_lines, $pre_end;
$self->{__prev_para_action} |= $END;
}
# The PRE and PRE_EXPLICIT structure can carry over
# from one paragraph to the next, but because we don't
# want trailing newlines, such newlines would have been
# gotten rid of in the previous call. However, with
# a preformatted text, we do want the blank lines in it
# to be preserved, so let's add a blank line in here.
if ($self->{__mode} & $PRE)
{
push @done_lines, "\n";
}
# Now, we do certain things which are only found at the
# start of a paragraph:
# HEADER, custom-header, TABLE and MAILHEADER
# These could concievably eat the rest of the paragraph.
if ($is_custom_header)
{
# custom header eats the first line
my $header = shift @para_lines;
shift @para_line_len;
shift @para_line_indent;
shift @para_line_action;
$self->custom_heading(line_ref => \$header);
push @done_lines, $header;
$self->{__prev_para_action} |= $HEADER;
}
elsif ($is_header)
{
# normal header eats the first two lines
my $header = shift @para_lines;
shift @para_line_len;
shift @para_line_indent;
shift @para_line_action;
my $underline = shift @para_lines;
shift @para_line_len;
shift @para_line_indent;
shift @para_line_action;
$self->heading(
line_ref => \$header,
next_ref => \$underline
);
push @done_lines, $header;
$self->{__prev_para_action} |= $HEADER;
}
# do the table stuff on the array of lines
if ($self->{make_tables} && $is_table)
{
if (
$self->tablestuff(
table_type => $table_type,
rows_ref => \@para_lines,
para_len => $para_len
)
)
{
# this has used up all the lines
push @done_lines, @para_lines;
@para_lines = ();
}
}
# check of this para is a mail-header
if ( $is_mailheader
&& !($self->{__mode} & $TABLE)
&& @para_lines)
{
$self->mailheader(rows_ref => \@para_lines);
# this has used up all the lines
push @done_lines, @para_lines;
@para_lines = ();
}
#
# Now go through the paragraph lines one at a time
# Note that we won't have TABLE, MAILHEADER, HEADER modes
# because they would have eaten the lines
#
my $prev = '';
my $prev_action = $self->{__prev_para_action};
for (my $i = 0; $i < @para_lines; $i++)
{
my $prev_ref;
my $prev_action_ref;
my $prev_line_indent;
my $prev_line_len;
if ($i == 0)
{
$prev_ref = \$prev;
$prev_action_ref = \$prev_action;
$prev_line_indent = 0;
$prev_line_len = 0;
}
else
{
$prev_ref = \$para_lines[$i - 1];
$prev_action_ref = \$para_line_action[$i - 1];
$prev_line_indent = $para_line_indent[$i - 1];
$prev_line_len = $para_line_len[$i - 1];
}
my $next_ref;
if ($i == $#para_lines)
{
$next_ref = undef;
}
else
{
$next_ref = \$para_lines[$i + 1];
}
$para_lines[$i] = escape($para_lines[$i])
if ($self->{escape_HTML_chars});
if ($self->{mailmode}
&& !($self->{__mode} & ($PRE_EXPLICIT)))
{
$self->mailquote(
line_ref => \$para_lines[$i],
line_action_ref => \$para_line_action[$i],
prev_ref => $prev_ref,
prev_action_ref => $prev_action_ref,
next_ref => $next_ref
);
}
if ( ($self->{__mode} & $PRE)
&& ($self->{preformat_trigger_lines} != 0))
{
$self->endpreformat(
para_lines_ref => \@para_lines,
para_action_ref => \@para_line_action,
ind => $i,
prev_ref => $prev_ref
);
}
if (!($self->{__mode} & $PRE))
{
$self->hrule(
para_lines_ref => \@para_lines,
para_action_ref => \@para_line_action,
ind => $i
);
}
if (!($self->{__mode} & ($PRE))
&& ($para_lines[$i] !~ /^\s*$/))
{
$self->liststuff(
para_lines_ref => \@para_lines,
para_action_ref => \@para_line_action,
para_line_indent_ref => \@para_line_indent,
ind => $i,
prev_ref => $prev_ref
);
}
if ( !($para_line_action[$i] & ($HEADER | $LIST))
&& !($self->{__mode} & ($LIST | $PRE))
&& $self->{__preformat_enabled})
{
$self->preformat(
mode_ref => \$self->{__mode},
line_ref => \$para_lines[$i],
line_action_ref => \$para_line_action[$i],
prev_ref => $prev_ref,
next_ref => $next_ref,
prev_action_ref => $prev_action_ref
);
}
if (!($self->{__mode} & ($PRE)))
{
$self->paragraph(
line_ref => \$para_lines[$i],
line_action_ref => \$para_line_action[$i],
prev_ref => $prev_ref,
prev_action_ref => $prev_action_ref,
line_indent => $para_line_indent[$i],
prev_indent => $prev_line_indent,
is_fragment => $args{is_fragment},
ind => $i,
);
}
if (!($self->{__mode} & ($PRE | $LIST)))
{
$self->shortline(
line_ref => \$para_lines[$i],
line_action_ref => \$para_line_action[$i],
prev_ref => $prev_ref,
prev_action_ref => $prev_action_ref,
prev_line_len => $prev_line_len
);
}
if (!($self->{__mode} & ($PRE)))
{
$self->caps(
line_ref => \$para_lines[$i],
line_action_ref => \$para_line_action[$i]
);
}
# put the "prev" line in front of the first line
$para_lines[$i] = $prev . $para_lines[$i]
if ($i == 0 && ($prev !~ /^\s*$/));
}
# para action is the action of the last line of the para
$para_action = $para_line_action[$#para_line_action];
$para_action = $NONE if (!defined $para_action);
# push them on the done lines
push @done_lines, @para_lines;
@para_lines = ();
}
# now put the para back together as one string
$para = join('', @done_lines);
# if this is a paragraph, and we are in XHTML mode,
# close an open paragraph.
if ($self->{xhtml})
{
my $open_tag = @{$self->{__tags}}[$#{$self->{__tags}}];
if (defined $open_tag && $open_tag eq 'p')
{
$para .= $self->close_tag('p');
}
}
if (
$self->{unhyphenation}
# ends in hyphen & next line starts w/letters
&& ($para =~ /[^\W\d_]\-\n\s*[^\W\d_]/s) && !(
$self->{__mode} &
($PRE | $HEADER | $MAILHEADER | $TABLE | $BREAK)
)
)
{
$self->unhyphenate_para(\$para);
}
# chop trailing newlines for continuing lists and PRE
if ( $self->{__mode} & $LIST
|| $self->{__mode} & $PRE)
{
$para =~ s/\n$//g;
}
}
# apply links and bold/italic/underline formatting
if ($para !~ /^\s*$/)
{
$self->apply_links(
para_ref => \$para,
para_action_ref => \$para_action
);
}
# close any open lists if required to
if ( $args{close_tags}
&& $self->{__mode} & $LIST) # End all lists
{
$self->endlist(
num_lists => $self->{__listnum},
prev_ref => \$para,
line_action_ref => \$para_action
);
}
# close any open tags
if ($args{close_tags} && $self->{xhtml})
{
while (@{$self->{__tags}})
{
$para .= $self->close_tag('');
}
}
# convert remaining Microsoft character codes into sensible HTML
if ($self->{demoronize} && !$self->{eight_bit_clean})
{
$para = demoronize_code($para);
}
# All the matching and formatting is done. Now we can
# replace non-ASCII characters with character entities.
if (!$self->{eight_bit_clean})
{
my @chars = split(//, $para);
foreach $_ (@chars)
{
$_ = $char_entities{$_} if defined($char_entities{$_});
}
$para = join('', @chars);
}
$self->{__prev_para_action} = $para_action;
return $para;
} # process_para
=head2 txt2html
$conv->txt2html(%args);
Convert a text file to HTML. Takes a hash of arguments. See
L for the possible values of the arguments. Arguments which
have already been set with B or B will remain as they are,
unless they are overridden.
=cut
sub txt2html ($;$)
{
my $self = shift;
if (@_)
{
$self->args(@_);
}
$self->do_init_call();
my $outhandle;
my $outhandle_needs_closing;
# set up the output
if ($self->{outhandle})
{
$outhandle = $self->{outhandle};
$outhandle_needs_closing = 1;
}
elsif ($self->{outfile} eq "-")
{
$outhandle = *STDOUT;
$outhandle_needs_closing = 0;
}
else
{
open($outhandle, "> " . $self->{outfile})
|| die "Error: unable to open ", $self->{outfile}, ": $!\n";
$outhandle_needs_closing = 1;
}
# slurp up a paragraph at a time, a file at a time
local $/ = "";
my $para = '';
my $count = 0;
my $print_count = 0;
my @sources = ();
my $source_type;
if ($self->{infile} and @{$self->{infile}})
{
@sources = @{$self->{infile}};
$source_type = 'file';
}
elsif ($self->{inhandle} and @{$self->{inhandle}})
{
@sources = @{$self->{inhandle}};
$source_type = 'filehandle';
}
elsif ($self->{instring} and @{$self->{instring}})
{
@sources = @{$self->{instring}};
$source_type = 'string';
}
my $inhandle;
my $inhandle_needs_closing = 0;
foreach my $source (@sources)
{
$inhandle = undef;
if ($source_type eq 'file')
{
if (!$source or $source eq '-')
{
$inhandle = *STDIN;
$inhandle_needs_closing = 0;
}
else
{
if (-f $source && open($inhandle, $source))
{
$inhandle_needs_closing = 1;
}
else # error
{
warn "Could not open $source\n";
next;
}
}
}
elsif ($source_type eq 'filehandle')
{
$inhandle = $source;
$inhandle_needs_closing = 1;
}
if ($source_type eq 'string')
{
# process the string
$para = $_;
$para =~ s/\n$//; # trim the endline
if ($count == 0)
{
$self->do_file_start($outhandle, $para);
}
$self->{__done_with_sect_link} = [];
$para = $self->process_chunk($para, close_tags => 0);
print $outhandle $para, "\n";
$print_count++;
$count++;
}
else # file or filehandle
{
while (<$inhandle>)
{
$para = $_;
$para =~ s/\n$//; # trim the endline
if ($count == 0)
{
$self->do_file_start($outhandle, $para);
}
$self->{__done_with_sect_link} = [];
$para = $self->process_chunk($para, close_tags => 0);
print $outhandle $para, "\n";
$print_count++;
$count++;
}
if ($inhandle_needs_closing)
{
close($inhandle);
}
}
} # for each file
$self->{__prev} = "";
if ($self->{__mode} & $LIST) # End all lists
{
$self->endlist(
num_lists => $self->{__listnum},
prev_ref => \$self->{__prev},
line_action_ref => \$self->{__line_action}
);
}
print $outhandle $self->{__prev};
# end open preformats
if ($self->{__mode} & $PRE)
{
my $tag = $self->close_tag('pre');
print $outhandle $tag;
}
# close all open tags
if ( $self->{xhtml}
&& !$self->{extract}
&& @{$self->{__tags}})
{
if ($DictDebug & 8)
{
print STDERR "closing all tags at end\n";
}
# close any open tags (until we get to the body)
my $open_tag = @{$self->{__tags}}[$#{$self->{__tags}}];
while (@{$self->{__tags}}
&& $open_tag ne 'body'
&& $open_tag ne 'html')
{
print $outhandle $self->close_tag('');
$open_tag = @{$self->{__tags}}[$#{$self->{__tags}}];
}
print $outhandle "\n";
}
if ($self->{append_file})
{
if (-r $self->{append_file})
{
open(APPEND, $self->{append_file});
while ()
{
print $outhandle $_;
$print_count++;
}
close(APPEND);
}
else
{
print STDERR "Can't find or read file ", $self->{append_file},
" to append.\n";
}
}
# print the closing tags (if we have printed stuff at all)
if ($print_count && !$self->{extract})
{
print $outhandle $self->close_tag('body'), "\n";
print $outhandle $self->close_tag('html'), "\n";
}
if ($outhandle_needs_closing)
{
close($outhandle);
}
return 1;
}
=head1 PRIVATE METHODS
These are methods used internally, only of interest to developers.
=cut
#---------------------------------------------------------------#
# Init-related subroutines
=head2 init_our_data
$self->init_our_data();
Initializes the internal object data.
=cut
sub init_our_data ($)
{
my $self = shift;
#
# All the options, in alphabetical order
#
$self->{append_file} = '';
$self->{append_head} = '';
$self->{body_deco} = '';
$self->{bullets} = '-=o*\267';
$self->{bullets_ordered} = '';
$self->{bold_delimiter} = '#';
$self->{caps_tag} = 'STRONG';
$self->{custom_heading_regexp} = [];
$self->{default_link_dict} =
($ENV{HOME} ? "$ENV{HOME}/.txt2html.dict" : '.txt2html.dict');
$self->{doctype} = '-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd';
$self->{demoronize} = 1;
$self->{eight_bit_clean} = 0;
$self->{escape_HTML_chars} = 1;
$self->{explicit_headings} = 0;
$self->{extract} = 0;
$self->{hrule_min} = 4;
$self->{indent_width} = 2;
$self->{indent_par_break} = 0;
$self->{infile} = [];
$self->{inhandle} = [];
$self->{instring} = [];
$self->{italic_delimiter} = '*';
$self->{links_dictionaries} = [];
$self->{link_only} = 0;
$self->{lower_case_tags} = 0;
$self->{mailmode} = 0;
$self->{make_anchors} = 1;
$self->{make_links} = 1;
$self->{make_tables} = 0;
$self->{min_caps_length} = 3;
$self->{outfile} = '-';
$self->{par_indent} = 2;
$self->{preformat_trigger_lines} = 2;
$self->{endpreformat_trigger_lines} = 2;
$self->{preformat_start_marker} = "^(:?(:?<)|<)PRE(:?(:?>)|>)\$";
$self->{preformat_end_marker} = "^(:?(:?<)|<)/PRE(:?(:?>)|>)\$";
$self->{preformat_whitespace_min} = 5;
$self->{prepend_file} = '';
$self->{preserve_indent} = 0;
$self->{short_line_length} = 40;
$self->{style_url} = '';
$self->{tab_width} = 8;
$self->{table_type} = {
ALIGN => 1,
PGSQL => 1,
BORDER => 1,
DELIM => 1,
};
$self->{title} = '';
$self->{titlefirst} = 0;
$self->{underline_delimiter} = '_';
$self->{underline_length_tolerance} = 1;
$self->{underline_offset_tolerance} = 1;
$self->{unhyphenation} = 1;
$self->{use_mosaic_header} = 0;
$self->{use_preformat_marker} = 0;
$self->{xhtml} = 1;
# accumulation variables
$self->{__file} = ""; # Current file being processed
$self->{__heading_styles} = {};
$self->{__num_heading_styles} = 0;
$self->{__links_table} = {};
$self->{__links_table_order} = [];
$self->{__links_table_patterns} = {};
$self->{__search_patterns} = [];
$self->{__repl_code} = [];
$self->{__prev_para_action} = 0;
$self->{__non_header_anchor} = 0;
$self->{__mode} = 0;
$self->{__listnum} = 0;
$self->{__list_nice_indent} = "";
$self->{__list_indent} = [];
$self->{__call_init_done} = 0;
#
# The global links data
#
my $system_dict = <<'EOT';
#
# Global links dictionary file for HTML::TextToHTML
# http://www.katspace.com/tools/text_to_html
# http://txt2html.sourceforge.net/
# based on links dictionary for Seth Golub's txt2html
# http://www.aigeek.com/txt2html/
#
# This dictionary contains some patterns for converting obvious URLs,
# ftp sites, hostnames, email addresses and the like to hrefs.
#
# Original adapted from the html.pl package by Oscar Nierstrasz in
# the Software Archive of the Software Composition Group
# http://iamwww.unibe.ch/~scg/Src/
#
# Some people even like to mark the URL label explicitly
/<URL:([-\w\.\/:~_\@]+):([a-zA-Z0-9'() ]+)>/ -h-> $2
# Some people like to mark URLs explicitly
/<URL:\s*(\S+?)\s*>/ -h-> $1
#
/<(http:\S+?)\s*>/ -h-> <$1>
# Urls: :
|snews:[\w\.]+| -> $&
|news:[\w\.]+| -> $&
|nntp:[\w/\.:+\-]+| -> $&
|http:[\w/\.:\@+\-~\%#?=&;,]+[\w/]| -> $&
|shttp:[\w/\.:+\-~\%#?=&;,]+| -> $&
|https:[\w/\.:+\-~\%#?=&;,]+| -> $&
|file:[\w/\.:+\-]+| -> $&
|ftp:[\w/\.:+\-]+| -> $&
|wais:[\w/\.:+\-]+| -> $&
|gopher:[\w/\.:+\-]+| -> $&
|telnet:[\w/\@\.:+\-]+| -> $&
# catch some newsgroups to avoid confusion with sites:
|([^\w\-/\.:\@>])(alt\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(bionet\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(bit\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(biz\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(clari\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(comp\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(gnu\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(humanities\.[\w\.+\-]+[\w+\-]+)|
-h-> $1$2
|([^\w\-/\.:\@>])(k12\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(misc\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(news\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(rec\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(soc\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(talk\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(us\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(ch\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
|([^\w\-/\.:\@>])(de\.[\w\.+\-]+[\w+\-]+)| -h-> $1$2
# FTP locations (with directory):
# anonymous@:
|(anonymous\@)([[:alpha:]][\w\.+\-]+\.[[:alpha:]]{2,}):(\s*)([\w\d+\-/\.]+)|
-h-> $1$2:$4$3
# ftp@:
|(ftp\@)([[:alpha:]][\w\.+\-]+\.[[:alpha:]]{2,}):(\s*)([\w\d+\-/\.]+)|
-h-> $1$2:$4$3
# Email address
|[[:alnum:]_\+\-\.]+\@([[:alnum:]][\w\.+\-]+\.[[:alpha:]]{2,})|
-> mailto:$&
# :
|([^\w\-/\.:\@>])([[:alpha:]][\w\.+\-]+\.[[:alpha:]]{2,}):(\s*)([\w\d+\-/\.]+)|
-h-> $1$2:$4$3
# NB: don't confuse an http server with a port number for
# an FTP location!
# internet number version: :
|([^\w\-/\.:\@])(\d{2,}\.\d{2,}\.\d+\.\d+):([\w\d+\-/\.]+)|
-h-> $1$2:$3
# telnet
|telnet ([[:alpha:]][\w+\-]+(\.[\w\.+\-]+)+\.[[:alpha:]]{2,})\s+(\d{2,4})|
-h-> telnet $1 $3
# ftp
|ftp ([[:alpha:]][\w+\-]+(\.[\w\.+\-]+)+\.[[:alpha:]]{2,})|
-h-> ftp $1
# host with "ftp" in the machine name
|\b([[:alpha:]][\w])*ftp[\w]*(\.[\w+\-]+){2,}| -h-> ftp $&
# ftp.foo.net/blah/
|ftp(\.[\w\@:-]+)+/\S+| -> ftp://$&
# www.thehouse.org/txt2html/
|www(\.[\w\@:-]+)+/\S+| -> http://$&
# host with "www" in the machine name
|\b([[:alpha:]][\w])*www[\w]*(\.[\w+\-]+){2,}| -> http://$&/
#
|([[:alpha:]][\w+\-]+\.[\w+\-]+\.[[:alpha:]]{2,})\s+(\d{2,4})|
-h-> $1 $2
# just internet numbers with port:
|([^\w\-/\.:\@])(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s+(\d{1,4})|
-h-> $1$2 $3
# just internet numbers:
|([^\w\-/\.:\@])(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})|
-h-> $1$2
# RFCs
/RFC ?(\d+)/ -i-> http://www.cis.ohio-state.edu/rfc/rfc$1.txt
# Mark _underlined stuff_ as underlined stuff
# Don't mistake variable names for underlines, and
# take account of possible trailing punctuation
#/([ \t\n])_([[:alpha:]][[:alnum:]\s-]*[[:alpha:]])_([\s\.;:,\!\?])/ -h-> $1$2$3
# Seth and his amazing conversion program :-)
"Seth Golub" -o-> http://www.aigeek.com/
"txt2html" -o-> http://txt2html.sourceforge.net/
# Kathryn and her amazing modules 8-)
"Kathryn Andersen" -o-> http://www.katspace.com/
"HTML::TextToHTML" -o-> http://www.katspace.com/tools/text_to_html/
"hypertoc" -o-> http://www.katspace.com/tools/hypertoc/
"HTML::GenToc" -o-> http://www.katspace.com/tools/hypertoc/
# End of global dictionary
EOT
# pre-parse the above data by removing unwanted lines
# skip lines that start with '#'
$system_dict =~ s/^\#.*$//mg;
# skip lines that end with unescaped ':'
$system_dict =~ s/^.*[^\\]:\s*$//mg;
$self->{__global_links_data} = $system_dict;
} # init_our_data
#---------------------------------------------------------------#
# txt2html-related subroutines
=head2 deal_with_options
$self->deal_with_options();
do extra processing related to particular options
=cut
sub deal_with_options ($)
{
my $self = shift;
if (!$self->{make_links})
{
$self->{'links_dictionaries'} = 0;
}
if ($self->{append_file})
{
if (!-r $self->{append_file})
{
print STDERR "Can't find or read ", $self->{append_file}, "\n";
$self->{append_file} = '';
}
}
if ($self->{prepend_file})
{
if (!-r $self->{prepend_file})
{
print STDERR "Can't find or read ", $self->{prepend_file}, "\n";
$self->{'prepend_file'} = '';
}
}
if ($self->{append_head})
{
if (!-r $self->{append_head})
{
print STDERR "Can't find or read ", $self->{append_head}, "\n";
$self->{'append_head'} = '';
}
}
if (!$self->{outfile})
{
$self->{'outfile'} = "-";
}
$self->{'preformat_trigger_lines'} = 0
if ($self->{preformat_trigger_lines} < 0);
$self->{'preformat_trigger_lines'} = 2
if ($self->{preformat_trigger_lines} > 2);
$self->{'endpreformat_trigger_lines'} = 1
if ($self->{preformat_trigger_lines} == 0);
$self->{'endpreformat_trigger_lines'} = 0
if ($self->{endpreformat_trigger_lines} < 0);
$self->{'endpreformat_trigger_lines'} = 2
if ($self->{endpreformat_trigger_lines} > 2);
$self->{__preformat_enabled} =
(($self->{endpreformat_trigger_lines} != 0)
|| $self->{use_preformat_marker});
if ($self->{use_mosaic_header})
{
my $num_heading_styles = 0;
my %heading_styles = ();
$heading_styles{"*"} = ++$num_heading_styles;
$heading_styles{"="} = ++$num_heading_styles;
$heading_styles{"+"} = ++$num_heading_styles;
$heading_styles{"-"} = ++$num_heading_styles;
$heading_styles{"~"} = ++$num_heading_styles;
$heading_styles{"."} = ++$num_heading_styles;
$self->{__heading_styles} = \%heading_styles;
$self->{__num_heading_styles} = $num_heading_styles;
}
# XHTML implies lower case
$self->{'lower_case_tags'} = 1 if ($self->{xhtml});
}
=head2 escape
$newtext = escape($text);
Escape & < and >
=cut
sub escape ($)
{
my ($text) = @_;
$text =~ s/&/&/g;
$text =~ s/>/>/g;
$text =~ s/</g;
return $text;
}
=head2 demoronize_char
$newtext = demoronize_char($text);
Convert Microsoft character entities into characters.
Added by Alan Jackson, alan at ajackson dot org, and based
on the demoronize script by John Walker, http://www.fourmilab.ch/
=cut
sub demoronize_char($)
{
my $s = shift;
# Map strategically incompatible non-ISO characters in the
# range 0x82 -- 0x9F into plausible substitutes where
# possible.
$s =~ s/\x82/,/g;
$s =~ s/\x84/,,/g;
$s =~ s/\x85/.../g;
$s =~ s/\x88/^/g;
$s =~ s/\x8B//g;
$s =~ s/\x9C/oe/g;
return $s;
}
=head2 demoronize_code
$newtext = demoronize_code($text);
convert Microsoft character entities into HTML code
=cut
sub demoronize_code($)
{
my $s = shift;
# Map strategically incompatible non-ISO characters in the
# range 0x82 -- 0x9F into plausible substitutes where
# possible.
$s =~ s-\x83-f-g;
$s =~ s-\x98-~-g;
$s =~ s-\x99-TM-g;
return $s;
}
=head2 get_tag
$tag = $self->get_tag($in_tag);
$tag = $self->get_tag($in_tag,
tag_type=>TAG_START,
inside_tag=>'');
output the tag wanted (add the <> and the / if necessary)
- output in lower or upper case
- do tag-related processing
options:
tag_type=>TAG_START | tag_type=>TAG_END | tag_type=>TAG_EMPTY
(default start)
inside_tag=>string (default empty)
=cut
sub get_tag ($$;%)
{
my $self = shift;
my $in_tag = shift;
my %args = (
tag_type => TAG_START,
inside_tag => '',
@_
);
my $inside_tag = $args{inside_tag};
my $open_tag = @{$self->{__tags}}[$#{$self->{__tags}}];
if (!defined $open_tag)
{
$open_tag = '';
}
# close any open tags that need closing
# Note that we only have to check for the structural tags we make,
# not every possible HTML tag
my $tag_prefix = '';
if ($self->{xhtml})
{
if ( $open_tag eq 'p'
and $in_tag eq 'p'
and $args{tag_type} != TAG_END)
{
$tag_prefix = $self->close_tag('p');
}
elsif ( $open_tag eq 'p'
and $in_tag =~ /^(hr|ul|ol|dl|pre|table|h)/)
{
$tag_prefix = $self->close_tag('p');
}
elsif ( $open_tag eq 'li'
and $in_tag eq 'li'
and $args{tag_type} != TAG_END)
{
# close a LI before the next LI
$tag_prefix = $self->close_tag('li');
}
elsif ( $open_tag eq 'li'
and $in_tag =~ /^(ul|ol)$/
and $args{tag_type} == TAG_END)
{
# close the LI before the list closes
$tag_prefix = $self->close_tag('li');
}
elsif ( $open_tag eq 'dt'
and $in_tag eq 'dd'
and $args{tag_type} != TAG_END)
{
# close a DT before the next DD
$tag_prefix = $self->close_tag('dt');
}
elsif ( $open_tag eq 'dd'
and $in_tag eq 'dt'
and $args{tag_type} != TAG_END)
{
# close a DD before the next DT
$tag_prefix = $self->close_tag('dd');
}
elsif ( $open_tag eq 'dd'
and $in_tag eq 'dl'
and $args{tag_type} == TAG_END)
{
# close the DD before the list closes
$tag_prefix = $self->close_tag('dd');
}
}
my $out_tag = $in_tag;
if ($args{tag_type} == TAG_END)
{
$out_tag = $self->close_tag($in_tag);
}
else
{
if ($self->{lower_case_tags})
{
$out_tag =~ tr/A-Z/a-z/;
}
else # upper case
{
$out_tag =~ tr/a-z/A-Z/;
}
if ($args{tag_type} == TAG_EMPTY)
{
if ($self->{xhtml})
{
$out_tag = "<${out_tag}${inside_tag}/>";
}
else
{
$out_tag = "<${out_tag}${inside_tag}>";
}
}
else
{
push @{$self->{__tags}}, $in_tag;
$out_tag = "<${out_tag}${inside_tag}>";
}
}
$out_tag = $tag_prefix . $out_tag if $tag_prefix;
if ($DictDebug & 8)
{
print STDERR
"open_tag = '${open_tag}', in_tag = '${in_tag}', tag_type = ",
$args{tag_type},
", inside_tag = '${inside_tag}', out_tag = '$out_tag'\n";
}
return $out_tag;
} # get_tag
=head2 close_tag
$tag = $self->close_tag($in_tag);
close the open tag
=cut
sub close_tag ($$)
{
my $self = shift;
my $in_tag = shift;
my $open_tag = pop @{$self->{__tags}};
$in_tag ||= $open_tag;
# put the open tag back on the stack if the in-tag is not the same
if (defined $open_tag && $open_tag ne $in_tag)
{
push @{$self->{__tags}}, $open_tag;
}
my $out_tag = $in_tag;
if ($self->{lower_case_tags})
{
$out_tag =~ tr/A-Z/a-z/;
}
else # upper case
{
$out_tag =~ tr/a-z/A-Z/;
}
$out_tag = "<\/${out_tag}>";
if ($DictDebug & 8)
{
print STDERR
"close_tag: open_tag = '${open_tag}', in_tag = '${in_tag}', out_tag = '$out_tag'\n";
}
return $out_tag;
}
=head2 hrule
$self->hrule(para_lines_ref=>$para_lines,
para_action_ref=>$para_action,
ind=>0);
Deal with horizontal rules.
=cut
sub hrule ($%)
{
my $self = shift;
my %args = (
para_lines_ref => undef,
para_action_ref => undef,
ind => 0,
@_
);
my $para_lines_ref = $args{para_lines_ref};
my $para_action_ref = $args{para_action_ref};
my $ind = $args{ind};
my $hrmin = $self->{hrule_min};
if ($para_lines_ref->[$ind] =~ /^\s*([-_~=\*]\s*){$hrmin,}$/)
{
my $tag = $self->get_tag("hr", tag_type => TAG_EMPTY);
$para_lines_ref->[$ind] = "$tag\n";
$para_action_ref->[$ind] |= $HRULE;
}
elsif ($para_lines_ref->[$ind] =~ /\014/)
{
# Linefeeds become horizontal rules
$para_action_ref->[$ind] |= $HRULE;
my $tag = $self->get_tag("hr", tag_type => TAG_EMPTY);
$para_lines_ref->[$ind] =~ s/\014/\n${tag}\n/g;
}
}
=head2 shortline
$self->shortline(line_ref=>$line_ref,
line_action_ref=>$line_action_ref,
prev_ref=>$prev_ref,
prev_action_ref=>$prev_action_ref,
prev_line_len=>$prev_line_len);
Deal with short lines.
=cut
sub shortline ($%)
{
my $self = shift;
my %args = (
line_ref => undef,
line_action_ref => undef,
prev_ref => undef,
prev_action_ref => undef,
prev_line_len => 0,
@_
);
my $mode_ref = $args{mode_ref};
my $line_ref = $args{line_ref};
my $line_action_ref = $args{line_action_ref};
my $prev_ref = $args{prev_ref};
my $prev_action_ref = $args{prev_action_ref};
my $prev_line_len = $args{prev_line_len};
# Short lines should be broken even on list item lines iff the
# following line is more text. I haven't figured out how to do
# that yet. For now, I'll just not break on short lines in lists.
# (sorry)
my $tag = $self->get_tag('br', tag_type => TAG_EMPTY);
if (
${$line_ref} !~ /^\s*$/
&& ${$prev_ref} !~ /^\s*$/
&& ($prev_line_len < $self->{short_line_length})
&& !(
${$line_action_ref} &
($END | $HEADER | $HRULE | $LIST | $IND_BREAK | $PAR)
)
&& !(${$prev_action_ref} & ($HEADER | $HRULE | $BREAK | $IND_BREAK))
)
{
${$prev_ref} .= $tag . chop(${$prev_ref});
${$prev_action_ref} |= $BREAK;
}
}
=head2 is_mailheader
if ($self->is_mailheader(rows_ref=>$rows_ref))
{
...
}
Is this a mailheader line?
=cut
sub is_mailheader ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
@_
);
my $rows_ref = $args{rows_ref};
# a mail header is assumed to be the whole
# paragraph which starts with a From , From: or Newsgroups: line
if ($rows_ref->[0] =~ /^(From:?)|(Newsgroups:) /)
{
return 1;
}
return 0;
} # is_mailheader
=head2 mailheader
$self->mailheader(rows_ref=>$rows_ref);
Deal with a mailheader.
=cut
sub mailheader ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
@_
);
my $rows_ref = $args{rows_ref};
# a mail header is assumed to be the whole
# paragraph which starts with a From: or Newsgroups: line
my $tag = '';
my @rows = @{$rows_ref};
if ($self->is_mailheader(%args))
{
$self->{__mode} |= $MAILHEADER;
if ($self->{escape_HTML_chars})
{
$rows[0] = escape($rows[0]);
}
$self->anchor_mail(\$rows[0]);
chomp ${rows}[0];
$tag = $self->get_tag('p', inside_tag => " class='mail_header'");
my $tag2 = $self->get_tag('br', tag_type => TAG_EMPTY);
$rows[0] =
join('', "\n", $tag, $rows[0], $tag2, "\n");
# now put breaks on the rest of the paragraph
# apart from the last line
for (my $rn = 1; $rn < @rows; $rn++)
{
if ($self->{escape_HTML_chars})
{
$rows[$rn] = escape($rows[$rn]);
}
if ($rn != (@rows - 1))
{
$tag = $self->get_tag('br', tag_type => TAG_EMPTY);
chomp $rows[$rn];
$rows[$rn] =~ s/$/${tag}\n/;
}
}
}
@{$rows_ref} = @rows;
} # mailheader
=head2 mailquote
$self->mailquote(line_ref=>$line_ref,
line_action_ref=>$line_action_ref,
prev_ref=>$prev_ref,
prev_action_ref=>$prev_action_ref,
next_ref=>$next_ref);
Deal with quoted mail.
=cut
sub mailquote ($%)
{
my $self = shift;
my %args = (
line_ref => undef,
line_action_ref => undef,
prev_ref => undef,
prev_action_ref => undef,
next_ref => undef,
@_
);
my $line_ref = $args{line_ref};
my $line_action_ref = $args{line_action_ref};
my $prev_ref = $args{prev_ref};
my $prev_action_ref = $args{prev_action_ref};
my $next_ref = $args{next_ref};
my $tag = '';
if (
(
(${$line_ref} =~ /^\w*>/) # Handle "FF> Werewolves."
|| (${$line_ref} =~ /^[\|:]/)
) # Handle "[|:] There wolves."
&& defined($next_ref) && (${$next_ref} !~ /^\s*$/)
)
{
$tag = $self->get_tag('br', tag_type => TAG_EMPTY);
${$line_ref} =~ s/$/${tag}/;
${$line_action_ref} |= ($BREAK | $MAILQUOTE);
if (!(${$prev_action_ref} & ($BREAK | $MAILQUOTE)))
{
$tag = $self->get_tag('p', inside_tag => " class='quote_mail'");
${$prev_ref} .= $tag;
${$line_action_ref} |= $PAR;
}
}
}
=head2 subtract_modes
$newvector = subtract_modes($vector, $mask);
Subtracts modes listed in $mask from $vector.
=cut
sub subtract_modes ($$)
{
my ($vector, $mask) = @_;
return ($vector | $mask) - $mask;
}
=head2 paragraph
$self->paragraph(line_ref=>$line_ref,
line_action_ref=>$line_action_ref,
prev_ref=>$prev_ref,
prev_action_ref=>$prev_action_ref,
line_indent=>$line_indent,
prev_indent=>$prev_indent,
is_fragment=>$is_fragment,
ind=>$ind);
Detect paragraph indentation.
=cut
sub paragraph ($%)
{
my $self = shift;
my %args = (
line_ref => undef,
line_action_ref => undef,
prev_ref => undef,
prev_action_ref => undef,
line_indent => 0,
prev_indent => 0,
is_fragment => 0,
ind => 0,
@_
);
my $line_ref = $args{line_ref};
my $line_action_ref = $args{line_action_ref};
my $prev_ref = $args{prev_ref};
my $prev_action_ref = $args{prev_action_ref};
my $line_indent = $args{line_indent};
my $prev_indent = $args{prev_indent};
my $is_fragment = $args{is_fragment};
my $line_no = $args{ind};
my $tag = '';
if (
${$line_ref} !~ /^\s*$/
&& !subtract_modes(
${$line_action_ref}, $END | $MAILQUOTE | $CAPS | $BREAK
)
&& ( ${$prev_ref} =~ /^\s*$/
|| (${$line_action_ref} & $END)
|| ($line_indent > $prev_indent + $self->{par_indent}))
&& !($is_fragment && $line_no == 0)
)
{
if ( $self->{indent_par_break}
&& ${$prev_ref} !~ /^\s*$/
&& !(${$line_action_ref} & $END)
&& ($line_indent > $prev_indent + $self->{par_indent}))
{
$tag = $self->get_tag('br', tag_type => TAG_EMPTY);
${$prev_ref} .= $tag;
${$prev_ref} .= " " x $line_indent;
${$line_ref} =~ s/^ {$line_indent}//;
${$prev_action_ref} |= $BREAK;
${$line_action_ref} |= $IND_BREAK;
}
elsif ($self->{preserve_indent})
{
$tag = $self->get_tag('p');
${$prev_ref} .= $tag;
${$prev_ref} .= " " x $line_indent;
${$line_ref} =~ s/^ {$line_indent}//;
${$line_action_ref} |= $PAR;
}
else
{
$tag = $self->get_tag('p');
${$prev_ref} .= $tag;
${$line_action_ref} |= $PAR;
}
}
# detect also a continuing indentation at the same level
elsif ($self->{indent_par_break}
&& !($self->{__mode} & ($PRE | $TABLE | $LIST))
&& ${$prev_ref} !~ /^\s*$/
&& !(${$line_action_ref} & $END)
&& (${$prev_action_ref} & ($IND_BREAK | $PAR))
&& !subtract_modes(${$line_action_ref}, $END | $MAILQUOTE | $CAPS)
&& ($line_indent > $self->{par_indent})
&& ($line_indent == $prev_indent))
{
$tag = $self->get_tag('br', tag_type => TAG_EMPTY);
${$prev_ref} .= $tag;
${$prev_ref} .= " " x $line_indent;
${$line_ref} =~ s/^ {$line_indent}//;
${$prev_action_ref} |= $BREAK;
${$line_action_ref} |= $IND_BREAK;
}
}
=head2 listprefix
($prefix, $number, $rawprefix, $term) = $self->listprefix($line);
Detect and parse a list item.
=cut
sub listprefix ($$)
{
my $self = shift;
my $line = shift;
my ($prefix, $number, $rawprefix, $term);
my $bullets = $self->{bullets};
my $bullets_ordered = $self->{bullets_ordered};
my $number_match = '(\d+|[^\W\d])';
if ($bullets_ordered)
{
$number_match = '(\d+|[[:alpha:]]|[' . "${bullets_ordered}])";
}
$self->{__number_match} = $number_match;
my $term_match = '(\w\w+)';
$self->{__term_match} = $term_match;
return (0, 0, 0, 0)
if ( !($line =~ /^\s*[${bullets}]\s+\S/)
&& !($line =~ /^\s*${number_match}[\.\)\]:]\s+\S/)
&& !($line =~ /^\s*${term_match}:$/));
($term) = $line =~ /^\s*${term_match}:$/;
($number) = $line =~ /^\s*${number_match}\S\s+\S/;
$number = 0 unless defined($number);
if ( $bullets_ordered
&& $number =~ /[${bullets_ordered}]/)
{
$number = 1;
}
# That slippery exception of "o" as a bullet
# (This ought to be determined using the context of what lists
# we have in progress, but this will probably work well enough.)
if ($bullets =~ /o/ && $line =~ /^\s*o\s/)
{
$number = 0;
}
if ($term)
{
($rawprefix) = $line =~ /^(\s*${term_match}.)$/;
$prefix = $rawprefix;
$prefix =~ s/${term_match}//; # Take the term out
}
elsif ($number)
{
($rawprefix) = $line =~ /^(\s*${number_match}.)/;
$prefix = $rawprefix;
$prefix =~ s/${number_match}//; # Take the number out
}
else
{
($rawprefix) = $line =~ /^(\s*[${bullets}].)/;
$prefix = $rawprefix;
}
($prefix, $number, $rawprefix, $term);
} # listprefix
=head2 startlist
$self->startlist(prefix=>$prefix,
number=>0,
rawprefix=>$rawprefix,
term=>$term,
para_lines_ref=>$para_lines_ref,
para_action_ref=>$para_action_ref,
ind=>0,
prev_ref=>$prev_ref,
total_prefix=>$total_prefix);
Start a list.
=cut
sub startlist ($%)
{
my $self = shift;
my %args = (
prefix => '',
number => 0,
rawprefix => '',
term => '',
para_lines_ref => undef,
para_action_ref => undef,
ind => 0,
prev_ref => undef,
total_prefix => '',
@_
);
my $prefix = $args{prefix};
my $number = $args{number};
my $rawprefix = $args{rawprefix};
my $term = $args{term};
my $para_lines_ref = $args{para_lines_ref};
my $para_action_ref = $args{para_action_ref};
my $ind = $args{ind};
my $prev_ref = $args{prev_ref};
my $tag = '';
$self->{__listprefix}->[$self->{__listnum}] = $prefix;
if ($number)
{
# It doesn't start with 1,a,A. Let's not screw with it.
if (($number ne "1") && ($number ne "a") && ($number ne "A"))
{
return 0;
}
$tag = $self->get_tag('ol');
${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
$self->{__list}->[$self->{__listnum}] = $OL;
}
elsif ($term)
{
$tag = $self->get_tag('dl');
${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
$self->{__list}->[$self->{__listnum}] = $DL;
}
else
{
$tag = $self->get_tag('ul');
${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
$self->{__list}->[$self->{__listnum}] = $UL;
}
$self->{__list_indent}->[$self->{__listnum}] = length($args{total_prefix});
$self->{__listnum}++;
$self->{__list_nice_indent} =
" " x $self->{__listnum} x $self->{indent_width};
$para_action_ref->[$ind] |= $LIST;
$para_action_ref->[$ind] |= $LIST_START;
$self->{__mode} |= $LIST;
1;
} # startlist
=head2 endlist
$self->endlist(num_lists=>0,
prev_ref=>$prev_ref,
line_action_ref=>$line_action_ref);
End N lists
=cut
sub endlist ($%)
{
my $self = shift;
my %args = (
num_lists => 0,
prev_ref => undef,
line_action_ref => undef,
@_
);
my $n = $args{num_lists};
my $prev_ref = $args{prev_ref};
my $line_action_ref = $args{line_action_ref};
my $tag = '';
for (; $n > 0; $n--, $self->{__listnum}--)
{
$self->{__list_nice_indent} =
" " x ($self->{__listnum} - 1) x $self->{indent_width};
if ($self->{__list}->[$self->{__listnum} - 1] == $UL)
{
$tag = $self->get_tag('ul', tag_type => TAG_END);
${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
pop @{$self->{__list_indent}};
}
elsif ($self->{__list}->[$self->{__listnum} - 1] == $OL)
{
$tag = $self->get_tag('ol', tag_type => TAG_END);
${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
pop @{$self->{__list_indent}};
}
elsif ($self->{__list}->[$self->{__listnum} - 1] == $DL)
{
$tag = $self->get_tag('dl', tag_type => TAG_END);
${$prev_ref} .= join('', $self->{__list_nice_indent}, $tag, "\n");
pop @{$self->{__list_indent}};
}
else
{
print STDERR "Encountered list of unknown type\n";
}
}
${$line_action_ref} |= $END;
$self->{__mode} ^= $LIST if (!$self->{__listnum});
} # endlist
=head2 continuelist
$self->continuelist(para_lines_ref=>$para_lines_ref,
para_action_ref=>$para_action_ref,
ind=>0,
term=>$term);
Continue a list.
=cut
sub continuelist ($%)
{
my $self = shift;
my %args = (
para_lines_ref => undef,
para_action_ref => undef,
ind => 0,
term => '',
@_
);
my $para_lines_ref = $args{para_lines_ref};
my $para_action_ref = $args{para_action_ref};
my $ind = $args{ind};
my $term = $args{term};
my $list_indent = $self->{__list_nice_indent};
my $bullets = $self->{bullets};
my $num_match = $self->{__number_match};
my $term_match = $self->{__term_match};
my $tag = '';
if ( $self->{__list}->[$self->{__listnum} - 1] == $UL
&& $para_lines_ref->[$ind] =~ /^\s*[${bullets}]\s*/)
{
$tag = $self->get_tag('li');
$para_lines_ref->[$ind] =~ s/^\s*[${bullets}]\s*/${list_indent}${tag}/;
$para_action_ref->[$ind] |= $LIST_ITEM;
}
if ($self->{__list}->[$self->{__listnum} - 1] == $OL)
{
$tag = $self->get_tag('li');
$para_lines_ref->[$ind] =~ s/^\s*${num_match}.\s*/${list_indent}${tag}/;
$para_action_ref->[$ind] |= $LIST_ITEM;
}
if ( $self->{__list}->[$self->{__listnum} - 1] == $DL
&& $term)
{
$tag = $self->get_tag('dt');
my $tag2 = $self->get_tag('dt', tag_type => TAG_END);
$term =~ s/_/ /g; # underscores are now spaces in the term
$para_lines_ref->[$ind] =~
s/^\s*${term_match}.$/${list_indent}${tag}${term}${tag2}/;
$tag = $self->get_tag('dd');
$para_lines_ref->[$ind] .= ${tag};
$para_action_ref->[$ind] |= $LIST_ITEM;
}
$para_action_ref->[$ind] |= $LIST;
} # continuelist
=head2 liststuff
$self->liststuff(para_lines_ref=>$para_lines_ref,
para_action_ref=>$para_action_ref,
para_line_indent_ref=>$para_line_indent_ref,
ind=>0,
prev_ref=>$prev_ref);
Process a list (higher-level method).
=cut
sub liststuff ($%)
{
my $self = shift;
my %args = (
para_lines_ref => undef,
para_action_ref => undef,
para_line_indent_ref => undef,
ind => 0,
prev_ref => undef,
@_
);
my $para_lines_ref = $args{para_lines_ref};
my $para_action_ref = $args{para_action_ref};
my $para_line_indent_ref = $args{para_line_indent_ref};
my $ind = $args{ind};
my $prev_ref = $args{prev_ref};
my $i;
my ($prefix, $number, $rawprefix, $term) =
$self->listprefix($para_lines_ref->[$ind]);
if (!$prefix)
{
# if the previous line is not blank
if ($ind > 0 && $para_lines_ref->[$ind - 1] !~ /^\s*$/)
{
# inside a list item
return;
}
# This might be a new paragraph within an existing list item;
# It will be the first line, and have the same indentation
# as the list's indentation.
if ( $ind == 0
&& $self->{__listnum}
&& $para_line_indent_ref->[$ind] ==
$self->{__list_indent}->[$self->{__listnum} - 1])
{
# start a paragraph
my $tag = $self->get_tag('p');
${$prev_ref} .= $tag;
$para_action_ref->[$ind] |= $PAR;
return;
}
# This ain't no list. We'll want to end all of them.
if ($self->{__listnum})
{
$self->endlist(
num_lists => $self->{__listnum},
prev_ref => $prev_ref,
line_action_ref => \$para_action_ref->[$ind]
);
}
return;
}
# If numbers with more than one digit grow to the left instead of
# to the right, the prefix will shrink and we'll fail to match the
# right list. We need to account for this.
my $prefix_alternate;
if (length("" . $number) > 1)
{
$prefix_alternate = (" " x (length("" . $number) - 1)) . $prefix;
}
# Maybe we're going back up to a previous list
for (
$i = $self->{__listnum} - 1;
($i >= 0) && ($prefix ne $self->{__listprefix}->[$i]);
$i--
)
{
if (length("" . $number) > 1)
{
last if $prefix_alternate eq $self->{__listprefix}->[$i];
}
}
my $islist;
# Measure the indent from where the text starts, not where the
# prefix starts. This won't screw anything up, and if we don't do
# it, the next line might appear to be indented relative to this
# line, and get tagged as a new paragraph.
my $bullets = $self->{bullets};
my $bullets_ordered = $self->{bullets_ordered};
my $term_match = $self->{__term_match};
my ($total_prefix) =
$para_lines_ref->[$ind] =~ /^(\s*[${bullets}${bullets_ordered}\w]+.\s*)/;
# a DL indent starts from the edge of the term, plus indent_width
if ($term)
{
($total_prefix) = $para_lines_ref->[$ind] =~ /^(\s*)${term_match}.$/;
$total_prefix .= " " x $self->{indent_width};
}
# Of course, we only use it if it really turns out to be a list.
$islist = 1;
$i++;
if (($i > 0) && ($i != $self->{__listnum}))
{
$self->endlist(
num_lists => $self->{__listnum} - $i,
prev_ref => $prev_ref,
line_action_ref => \$para_action_ref->[$ind]
);
$islist = 0;
}
elsif (!$self->{__listnum} || ($i != $self->{__listnum}))
{
if (
($para_line_indent_ref->[$ind] > 0)
|| $ind == 0
|| ($ind > 0 && ($para_lines_ref->[$ind - 1] =~ /^\s*$/))
|| ( $ind > 0
&& $para_action_ref->[$ind - 1] & ($BREAK | $HEADER | $CAPS))
)
{
$islist = $self->startlist(
prefix => $prefix,
number => $number,
rawprefix => $rawprefix,
term => $term,
para_lines_ref => $para_lines_ref,
para_action_ref => $para_action_ref,
ind => $ind,
prev_ref => $prev_ref,
total_prefix => $total_prefix
);
}
else
{
# We have something like this: "- foo" which usually
# turns out not to be a list.
return;
}
}
$self->continuelist(
para_lines_ref => $para_lines_ref,
para_action_ref => $para_action_ref,
ind => $ind,
term => $term
)
if ($self->{__mode} & $LIST);
$para_line_indent_ref->[$ind] = length($total_prefix) if $islist;
} # liststuff
=head2 get_table_type
$table_type = $self->get_table_type(rows_ref=>$rows_ref,
para_len=>0);
Figure out the table type of this table, if any
=cut
sub get_table_type ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
para_len => 0,
@_
);
my $table_type = 0;
if ( $self->{table_type}->{DELIM}
&& $self->is_delim_table(%args))
{
$table_type = $TAB_DELIM;
}
elsif ($self->{table_type}->{ALIGN}
&& $self->is_aligned_table(%args))
{
$table_type = $TAB_ALIGN;
}
elsif ($self->{table_type}->{PGSQL}
&& $self->is_pgsql_table(%args))
{
$table_type = $TAB_PGSQL;
}
elsif ($self->{table_type}->{BORDER}
&& $self->is_border_table(%args))
{
$table_type = $TAB_BORDER;
}
return $table_type;
}
=head2 is_aligned_table
if ($self->is_aligned_table(rows_ref=>$rows_ref, para_len=>0))
{
...
}
Check if the given paragraph-array is an aligned table
=cut
sub is_aligned_table ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
para_len => 0,
@_
);
my $rows_ref = $args{rows_ref};
my $para_len = $args{para_len};
# TABLES: spot and mark up tables. We combine the lines of the
# paragraph using the string bitwise or (|) operator, the result
# being in $spaces. A character in $spaces is a space only if
# there was a space at that position in every line of the
# paragraph. $space can be used to search for contiguous spaces
# that occur on all lines of the paragraph. If this results in at
# least two columns, the paragraph is identified as a table.
# Note that this sub must be called before checking for preformatted
# lines because a table may well have whitespace to the left, in
# which case it must not be incorrectly recognised as a preformat.
my @rows = @{$rows_ref};
my @starts;
my $spaces = '';
my $max = 0;
my $min = $para_len;
foreach my $row (@rows)
{
($spaces |= $row) =~ tr/ /\xff/c;
$min = length $row if length $row < $min;
$max = length $row if $max < length $row;
}
$spaces = substr $spaces, 0, $min;
push(@starts, 0) unless $spaces =~ /^ /;
while ($spaces =~ /((?:^| ) +)(?=[^ ])/g)
{
push @starts, pos($spaces);
}
if (2 <= @rows and 2 <= @starts)
{
return 1;
}
else
{
return 0;
}
}
=head2 is_pgsql_table
if ($self->is_pgsql_table(rows_ref=>$rows_ref, para_len=>0))
{
...
}
Check if the given paragraph-array is a Postgresql table
(the ascii format produced by Postgresql)
A PGSQL table can start with an optional table-caption,
then it has a row of column headings separated by |
then it has a row of ------+-----
then it has one or more rows of column values separated by |
then it has a row-count (N rows)
=cut
sub is_pgsql_table ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
para_len => 0,
@_
);
my $rows_ref = $args{rows_ref};
my $para_len = $args{para_len};
# A PGSQL table must have at least 4 rows (see above).
if (@{$rows_ref} < 4)
{
return 0;
}
my @rows = @{$rows_ref};
if ($rows[0] !~ /\|/ && $rows[0] =~ /^\s*\w+/) # possible caption
{
shift @rows;
}
if (@rows < 4)
{
return 0;
}
if ($rows[0] !~ /^\s*\w+\s+\|\s+/) # Colname |
{
return 0;
}
if ($rows[1] !~ /^\s*[-]+[+][-]+/) # ----+----
{
return 0;
}
if ($rows[2] !~ /^\s*[^|]*\s+\|\s+/) # value |
{
return 0;
}
# check the last row for rowcount
if ($rows[$#rows] !~ /\(\d+\s+rows\)/)
{
return 0;
}
return 1;
}
=head2 is_border_table
if ($self->is_border_table(rows_ref=>$rows_ref, para_len=>0))
{
...
}
Check if the given paragraph-array is a Border table.
A BORDER table can start with an optional table-caption,
then it has a row of +------+-----+
then it has a row of column headings separated by |
then it has a row of +------+-----+
then it has one or more rows of column values separated by |
then it has a row of +------+-----+
=cut
sub is_border_table ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
para_len => 0,
@_
);
my $rows_ref = $args{rows_ref};
my $para_len = $args{para_len};
# A BORDER table must have at least 5 rows (see above)
# And note that it could be indented with spaces
if (@{$rows_ref} < 5)
{
return 0;
}
my @rows = @{$rows_ref};
if ($rows[0] !~ /\|/ && $rows[0] =~ /^\s*\w+/) # possible caption
{
shift @rows;
}
if (@rows < 5)
{
return 0;
}
if ($rows[0] !~ /^\s*[+][-]+[+][-]+[+][-+]*$/) # +----+----+
{
return 0;
}
if ($rows[1] !~ /^\s*\|\s*\w+\s+\|\s+.*\|$/) # | Colname |
{
return 0;
}
if ($rows[2] !~ /^\s*[+][-]+[+][-]+[+][-+]*$/) # +----+----+
{
return 0;
}
if ($rows[3] !~ /^\s*\|\s*[^|]*\s+\|\s+.*\|$/) # | value |
{
return 0;
}
# check the last row for +------+------+
if ($rows[$#rows] !~ /^\s*[+][-]+[+][-]+[+][-+]*$/) # +----+----+
{
return 0;
}
return 1;
} # is_border_table
=head2 is_delim_table
if ($self->is_delim_table(rows_ref=>$rows_ref, para_len=>0))
{
...
}
Check if the given paragraph-array is a Delimited table.
A DELIM table can start with an optional table-caption,
then it has at least two rows which start and end and are
punctuated by a non-alphanumeric delimiter.
| val1 | val2 |
| val3 | val4 |
=cut
sub is_delim_table ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
para_len => 0,
@_
);
my $rows_ref = $args{rows_ref};
my $para_len = $args{para_len};
#
# And note that it could be indented with spaces
if (@{$rows_ref} < 2)
{
return 0;
}
my @rows = @{$rows_ref};
if ($rows[0] !~ /[^\w\s]/ && $rows[0] =~ /^\s*\w+/) # possible caption
{
shift @rows;
}
if (@rows < 2)
{
return 0;
}
# figure out if the row starts with a possible delimiter
my $delim = '';
if ($rows[0] =~ /^\s*([^[:alnum:]])/)
{
$delim = $1;
# have to get rid of ^ and [] and \
$delim =~ s/\^//g;
$delim =~ s/\[//g;
$delim =~ s/\]//g;
$delim =~ s/\\//g;
if (!$delim) # no delimiter after all
{
return 0;
}
}
else
{
return 0;
}
# There needs to be at least three delimiters in the row
my @all_delims = ($rows[0] =~ /[${delim}]/g);
my $total_num_delims = @all_delims;
if ($total_num_delims < 3)
{
return 0;
}
# All rows must start and end with the delimiter
# and have $total_num_delims number of them
foreach my $row (@rows)
{
if ($row !~ /^\s*[${delim}]/)
{
return 0;
}
if ($row !~ /[${delim}]\s*$/)
{
return 0;
}
@all_delims = ($row =~ /[${delim}]/g);
if (@all_delims != $total_num_delims)
{
return 0;
}
}
return 1;
} # is_delim_table
=head2 tablestuff
$self->tablestuff(table_type=>0,
rows_ref=>$rows_ref,
para_len=>0);
Process a table.
=cut
sub tablestuff ($%)
{
my $self = shift;
my %args = (
table_type => 0,
rows_ref => undef,
para_len => 0,
@_
);
my $table_type = $args{table_type};
if ($table_type eq $TAB_ALIGN)
{
return $self->make_aligned_table(%args);
}
if ($table_type eq $TAB_PGSQL)
{
return $self->make_pgsql_table(%args);
}
if ($table_type eq $TAB_BORDER)
{
return $self->make_border_table(%args);
}
if ($table_type eq $TAB_DELIM)
{
return $self->make_delim_table(%args);
}
} # tablestuff
=head2 make_aligned_table
$self->make_aligned_table(rows_ref=>$rows_ref,
para_len=>0);
Make an Aligned table.
=cut
sub make_aligned_table ($%)
{
my $self = shift;
my %args = (
rows_ref => undef,
para_len => 0,
@_
);
my $rows_ref = $args{rows_ref};
my $para_len = $args{para_len};
# TABLES: spot and mark up tables. We combine the lines of the
# paragraph using the string bitwise or (|) operator, the result
# being in $spaces. A character in $spaces is a space only if
# there was a space at that position in every line of the
# paragraph. $space can be used to search for contiguous spaces
# that occur on all lines of the paragraph. If this results in at
# least two columns, the paragraph is identified as a table.
# Note that this sub must be called before checking for preformatted
# lines because a table may well have whitespace to the left, in
# which case it must not be incorrectly recognised as a preformat.
my @rows = @{$rows_ref};
my @starts;
my @ends;
my $spaces;
my $max = 0;
my $min = $para_len;
foreach my $row (@rows)
{
($spaces |= $row) =~ tr/ /\xff/c;
$min = length $row if length $row < $min;
$max = length $row if $max < length $row;
}
$spaces = substr $spaces, 0, $min;
push(@starts, 0) unless $spaces =~ /^ /;
while ($spaces =~ /((?:^| ) +)(?=[^ ])/g)
{
push @ends, pos($spaces) - length $1;
push @starts, pos($spaces);
}
shift(@ends) if $spaces =~ /^ /;
push(@ends, $max);
# Two or more rows and two or more columns indicate a table.
if (2 <= @rows and 2 <= @starts)
{
$self->{__mode} |= $TABLE;
# For each column, guess whether it should be left, centre or
# right aligned by examining all cells in that column for space
# to the left or the right. A simple majority among those cells
# that actually have space to one side or another decides (if no
# alignment gets a majority, left alignment wins by default).
my @align;
my $cell = '';
foreach my $col (0 .. $#starts)
{
my @count = (0, 0, 0, 0);
foreach my $row (@rows)
{
my $width = $ends[$col] - $starts[$col];
$cell = substr $row, $starts[$col], $width;
++$count[($cell =~ /^ / ? 2 : 0) +
($cell =~ / $/ || length($cell) < $width ? 1 : 0)];
}
$align[$col] = 0;
my $population = $count[1] + $count[2] + $count[3];
foreach (1 .. 3)
{
if ($count[$_] * 2 > $population)
{
$align[$col] = $_;
last;
}
}
}
foreach my $row (@rows)
{
$row = join '', $self->get_tag('tr'), (
map {
$cell = substr $row, $starts[$_], $ends[$_] - $starts[$_];
$cell =~ s/^ +//;
$cell =~ s/ +$//;
if ($self->{escape_HTML_chars})
{
$cell = escape($cell);
}
(
$self->get_tag(
'td',
inside_tag => (
$self->{xhtml} ? $xhtml_alignments[$align[$_]]
: (
$self->{lower_case_tags}
? $lc_alignments[$align[$_]]
: $alignments[$align[$_]]
)
)
),
$cell,
$self->close_tag('td')
);
} 0 .. $#starts
),
$self->close_tag('tr');
}
# put the