package Text::TWikiFormat::SAX; use base 'XML::SAX::Base'; $VERSION = '0.03'; use strict; use XML::SAX::DocumentLocator; sub new { my ($class, %params) = @_; my $self = $class->SUPER::new(%params); $self->{_onlink} = $params{onlink}; return $self; } sub _parse_bytestream { my ($self, $fh) = @_; my $parser = TWiki::SAX::Parser->new($self->{_onlink}); $parser->set_parent($self); local $/; my $text = <$fh>; $parser->parse($text); } sub _parse_characterstream { my ($self, $fh) = @_; die "parse_characterstream not supported"; } sub _parse_string { my ($self, $str) = @_; my $parser = TWiki::SAX::Parser->new($self->{_onlink}); $parser->set_parent($self); $parser->parse($str); } sub _parse_systemid { my ($self, $sysid) = @_; my $parser = TWiki::SAX::Parser->new($self->{_onlink}); $parser->set_parent($self); open(FILE, $sysid) || die "Can't open $sysid: $!"; local $/; my $text = ; $parser->parse($text); } package TWiki::SAX::Parser; use XML::SAX::Writer; use HTML::Parser; use strict; use vars qw(@ENDING_WITH_EOL @AUTO_CLOSED $p $s $e $f $LAST_HTML_TAG); @ENDING_WITH_EOL = qw(h1 h2 h3 h4 h5 h6 li); @AUTO_CLOSED = qw(nop br hr); $b = qr/.*?(?:\n|\A)/s; # beginning of line $p = qr/.*?[ \(]|\A/s; # prefix, wikitags start with, $f = qr/[\s\,\.\;\:\!\?\)]|\Z/; # finalizer, wikitags end with $s = qr/[#\[\%\<\&\?A-Za-z0-9]/; # start, words start with $e = qr/.*?[A-Za-z0-9\:\]\%\>\;\?]/s; # end, words end with sub new { my ($class, $onlink) = @_; my $self = bless { _onlink => $onlink }, $class; $self->{html_parser} = HTML::Parser->new( api_version => 3, start_h => [\&_html_tag, "self, tagname, attr, text"], end_h => [\&_html_tag, "self, tagname, text"], marked_sections => 1, ); return $self; } sub _html_tag { my $parser = shift; $LAST_HTML_TAG = [@_]; $parser->eof(); } sub set_parent { my $self = shift; $self->{parent} = shift; } sub parent { my $self = shift; return $self->{parent}; } sub parse { my $self = shift; my $sysid = $self->parent->{ParserOptions}->{Source}{SystemId}; $self->parent->set_document_locator( XML::SAX::DocumentLocator->new( sub { "" }, sub { $sysid }, sub { $self->{line_number} }, sub { 0 }, ), ); $self->parent->start_document({}); $self->parent->start_element(_element('wiki')); $self->parse_wiki(shift); $self->parent->end_element(_element('wiki', 1)); $self->parent->end_document({}); } sub _open_element { my($self, $element) = @_; $self->parent->start_element(UNIVERSAL::isa($element, 'HASH') ? $element : _element($element)); push @{ $self->{stack} }, UNIVERSAL::isa($element, 'HASH') ? $element->{Name} : $element; } sub _close_element { my($self, $element) = @_; if (!$element) { my $exists; foreach my $ewe (@ENDING_WITH_EOL) { $exists += grep { $_ eq $ewe } @{ $self->{stack} } } return unless $exists; } while(@{ $self->{stack} }) { my $s_element = pop @{ $self->{stack} }; $self->parent->end_element(_element($s_element), 1); if ($element) { return 1 if ($s_element eq $element); } elsif (grep {$s_element eq $_} @ENDING_WITH_EOL) { return 1; } } } sub _open_list { my($self, $ident, $type) = @_; my $element = _get_list_element($type); my $prev_ident = $self->{list}->[-1]->[0] || 0; my $prev_element = $self->{list}->[-1]->[1] || ''; if ($ident == $prev_ident) { if ($element ne $prev_element) { if ($prev_element) { $self->_close_element($prev_element); pop @{ $self->{list} }; } $self->_open_element($element); push @{ $self->{list} }, [$ident, $element]; } } # opening new <*l> elsif ($ident > $prev_ident) { $self->_open_element($element); push @{ $self->{list} }, [$ident, $element]; } # elsif ($ident < $prev_ident) { while ($ident < $prev_ident) { $self->_close_element($prev_element); pop @{ $self->{list} }; $self->_open_list($ident, _get_list_type($element)); $prev_ident = $self->{list}->[-1]->[0] || 0; $prev_element = $self->{list}->[-1]->[1] || ''; } ; } } sub _close_list { my $self = shift; # getting first occurence of 'ul', 'ol' my $pos = 0; foreach (0..@{$self->{stack}}) { if ($self->{stack}->[$_] && $self->{stack}->[$_] =~ /^[ou]l$/) { $pos = $_; last ; } } my $result; while(@{ $self->{stack} } > $pos) { my $s_element = pop @{ $self->{stack} }; pop @{ $self->{list} } if ($s_element eq $self->{list}->[-1]->[1]); $self->parent->end_element(_element($s_element), 1); $result++; } return $result; } sub _get_list_element { my ($type) = @_; return ('ul') if $type eq '*'; return ('ol') if $type =~ /^\w+$/; die sprintf "unknow list element : \'%s\'", $type; } sub _get_list_type { my ($element) = @_; return ('*') if $element eq 'ul'; return ('1') if $element eq 'ol'; die sprintf "unknow list element : \'%s\'", $element; } sub _handle_found { my ($self, $pre, $post, $element, $type) = @_; $self->format_text($pre); my @elements = (UNIVERSAL::isa($element, 'ARRAY')) ? @$element : ($element); foreach (@elements) { if ($type eq 'open') { $self->_open_element($_); } else { $self->_close_element($_); } } $self->format_text($post); } sub parse_wiki { my $self = shift; $self->{stack} = []; $self->{list} = [[]]; $self->{in_table} = 0; $self->{'in_tr'} = 0; $self->{in_td} = 0; $self->{parse_wiki} = 1; $self->{parse_html} = 1; my ($text) = @_; $text =~ s/\r//g; # Remove \r $text =~ s/\\\n//g; # Join lines ending in "\" $self->format_text($text); $self->_close_element('__default'); } sub format_text { my($self, $text) = @_; if ($text) { # if ($text =~ s/(.*?)//s) { $self->format_text($1); $self->_open_element('pre'); $self->{parse_wiki} = 0; $self->{parse_html} = 0; $self->format_text($text); } #
    elsif ($self->{parse_html} && $text =~ s/(.*?)
//s) {
        $self->format_text($1);
        $self->_open_element('pre');
        $self->{parse_wiki} = 0;
        $self->format_text($text);
    }
    # horizontal line
    elsif ($self->{parse_wiki} && $text =~ s/($b)-{3,}(\s)/$2/s) {
        $self->format_text($1);
        $self->parent->start_element(_element('hr'));
        $self->parent->end_element(_element('hr'), 1);
        $self->format_text($text);
    }

    # openening tags
    # 
  • elsif ($self->{parse_wiki} && $text =~ s/($b)(\t+| {3,})(\*|\w)[\.\) ]+([^\n]+)//s) { my($f1, $f2, $f3, $f4, $f5) = ($1,$2,$3,$4,$5); $self->format_text($f1); $self->_open_list(length($f2), $f3); $self->_open_element('li'); $self->format_text($f4); if ($text !~ /^\n(\t+| {3,})(\*|\w+)[\.\) ]/) { $self->_close_list(); $text =~ s/^\n//; } $self->format_text($text); } # table handling elsif ($self->{parse_wiki} && $text =~ s/($b)\|([^\n\|]+)(\|+)/\|/s) { my($cell, $finalizer) = ($2,$3); $self->format_text($1); unless ($self->{in_table}) { my $el = _element('table'); $self->_open_element($el); $self->{in_table} = 1; } unless ($self->{'in_tr'}) { $self->_open_element('tr'); $self->{'in_tr'} = 1; } my $el = _element('td'); _add_attrib($el, 'colspan', length($finalizer)) if (length($finalizer) > 1); # aligning text inside cell $cell =~ /^(\s*).*?(\s*)$/; my $l1 = length( $1 || '' ); my $l2 = length( $2 || '' ); if( $l1 >= 2 ) { if( $l2 <= 1 ) { _add_attrib($el, 'align', 'right'); } else { _add_attrib($el, 'align', 'center'); } } $self->_open_element($el); $self->format_text($cell); $self->_close_element('td'); if ($self->{'in_tr'} && $text =~ s/^\|\n//) { $self->{'in_tr'} = 0; $self->_close_element('tr'); if ($self->{'in_table'} && $text !~ /^\|/) { $self->{'in_table'} = 0; $self->_close_element('table'); } } $self->format_text($text); } # openening tags #

    .. # handles pre, post elsif ($self->{parse_wiki} && $text =~ s/($b)---(\+{1,6})\s*//s) { $self->_handle_found($1, $text, 'h'.length($2), 'open'); } # elsif ($self->{parse_wiki} && $text =~ s/($p)\*($s)/$2/s) { $self->_handle_found($1, $text, 'strong', 'open'); } # elsif ($self->{parse_wiki} && $text =~ s/($p)\_($s)/$2/s) { $self->_handle_found($1, $text, 'em', 'open'); } # elsif ($self->{parse_wiki} && $text =~ s/($p)\_\_($s)/$2/s) { $self->_handle_found($1, $text, ['strong', 'em'], 'open'); } # elsif ($self->{parse_wiki} && $text =~ s/($p)\=($s)/$2/s) { $self->_handle_found($1, $text, 'code', 'open'); } # elsif ($self->{parse_wiki} && $text =~ s/($p)\=\=($s)/$2/s) { $self->_handle_found($1, $text, ['strong', 'code'], 'open'); } # elsif ($self->{parse_wiki} && $text =~ s/(.*)\[\[([^\]]+)\](?:\[([\w\t \-]+)\])?\]//s) { my ($link, $label) = ($2,$3); $self->format_text($1); $label ||= $link; $label =~ s/([^\/])\/[^\/].*$/$1/; ($link, $label) = $self->{_onlink}->($link, $label) if $self->{_onlink}; my $el = _element('a'); _add_attrib($el, 'href', $link); $self->_open_element($el); $self->parent->characters({Data => $label}); $self->_close_element('a'); $self->format_text($text); } elsif ($self->{parse_html} && $text =~ s/^([^<]*)(<[^\/])/$2/) { $self->format_text($1); $self->{html_parser}->parse($text); my $tag = $LAST_HTML_TAG->[0]; my $el = _element($tag); foreach my $attrib (keys %{ $LAST_HTML_TAG->[1] }) { _add_attrib($el, $attrib, $LAST_HTML_TAG->[1]->{$attrib}); } $self->_open_element($el); $self->_close_element($tag) if (grep $_ eq $tag, @AUTO_CLOSED); my $tag_text = quotemeta($LAST_HTML_TAG->[2]); $text =~ s/^.*?$tag_text\n*//; $self->format_text($text); } # closing tags # elsif ($text =~ s/(.*?)<\/verbatim>//s) { $self->format_text($1); $self->_close_element('pre'); $self->{parse_wiki} = 1; $self->{parse_html} = 1; $self->format_text($text); } #

  • elsif ($self->{parse_html} && $text =~ s/(.*?)<\/pre>//s) { $self->format_text($1); $self->_close_element('pre'); $self->{parse_wiki} = 1; $self->format_text($text); } # table elsif ($self->{parse_wiki} && $text =~ s/\|(\n|\Z)//s) { if ($self->{in_td}) { $self->_close_element('td'); $self->{in_td} = 0; } if ($self->{'in_tr'}) { $self->_close_element('tr'); $self->{'in_tr'} = 0; } $self->format_text($text); } # elsif ($self->{parse_wiki} && $text =~ s/($e)\*($f)/$2/s) { $self->_handle_found($1, $text, 'strong', 'close'); } # elsif ($self->{parse_wiki} && $text =~ s/($e)\_($f)/$2/s) { $self->_handle_found($1, $text, 'em', 'close'); } # elsif ($self->{parse_wiki} && $text =~ s/($e)\_\_($f)/$2/s) { $self->_handle_found($1, $text, ['em', 'strong'], 'close'); } # elsif ($self->{parse_wiki} && $text =~ s/($e)\=($f)/$2/s) { $self->_handle_found($1, $text, 'code', 'close'); } # elsif ($self->{parse_wiki} && $text =~ s/($e)\=\=([\s\,\.\;\:\!\?\)]|\Z)/$2/s) { $self->_handle_found($1, $text, ['code', 'strong'], 'close'); } # other html elsif ($self->{parse_html} && $text =~ s/^([^<]*)(<\/)/$2/) { $self->format_text($1); $self->{html_parser}->parse($text); my ($tag, $tag_text) = @{ $LAST_HTML_TAG }; $self->_close_element($tag); $text =~ s/^.*?$tag_text\n*//; $self->format_text($text); } # default text handling elsif ($text =~ s/^([^\n]+)//) { my $t = $self->{parse_html} ? $self->deescape($1) : $1; $self->parent->characters({Data => $t}); $self->format_text($text); } elsif ($text =~ s/^\n//) { my $closed += $self->_close_element() || 0; if ($self->{parse_wiki} && !$closed) { $self->parent->start_element(_element('br')); $self->parent->end_element(_element('br'), 1); } elsif (!$self->{parse_wiki}) { $self->parent->characters({Data => "\n"}); } $self->format_text($text); } } } sub setDeEscaperRegex { my $self = shift; my $writer = $self->parent->{Handler}->{Handler}; my %escape = reverse %{ $writer->{Escape} }; $self->{DeEscaperRegex} = eval 'qr/' . join( '|', map { $_ = "\Q$_\E" } keys %escape) . '/;'; $self->{DeEscape} = \%escape; return $self; } sub deescape { my $self = shift; my $str = shift; $self->setDeEscaperRegex unless defined $self->{DeEscaperRegex}; $str =~ s/($self->{DeEscaperRegex})/$self->{DeEscape}->{$1}/oge; return $str; } sub _element { my ($name, $end) = @_; return { Name => $name, LocalName => $name, $end ? () : (Attributes => {}), NamespaceURI => '', Prefix => '', }; } sub _add_attrib { my ($el, $name, $value) = @_; $el->{Attributes}{"{}$name"} = { Name => $name, LocalName => $name, Prefix => "", NamespaceURI => "", Value => $value, }; } 1; __END__ =head1 NAME Text::WikiFormat::SAX - a SAX parser for Wiki text =head1 SYNOPSIS use Text::WikiFormat::SAX; use XML::SAX::Writer; my $output = ''; my $parser = Text::WikiFormat::SAX->new( Handler => XML::SAX::Writer->new( Output => \$output ) ); $parser->parse_string($wiki_text); print $output; =head1 DESCRIPTION This module implements a SAX parser for WikiWiki text. The code is based on Text::WikiFormat, and so only supports the formatting that module supports. =cut