The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
package HTML::WikiConverter::MediaWiki;
use base 'HTML::WikiConverter';

use warnings;
use strict;

use URI;
use File::Basename;
use HTML::Tagset;
our $VERSION = '0.59';

=head1 NAME

HTML::WikiConverter::MediaWiki - Convert HTML to MediaWiki markup

=head1 SYNOPSIS

  use HTML::WikiConverter;
  my $wc = new HTML::WikiConverter( dialect => 'MediaWiki' );
  print $wc->html2wiki( $html );

=head1 DESCRIPTION

This module contains rules for converting HTML into MediaWiki
markup. See L<HTML::WikiConverter> for additional usage details.

=head1 ATTRIBUTES

In addition to the regular set of attributes recognized by the
L<HTML::WikiConverter> constructor, this dialect also accepts the
following attributes:

=head2 preserve_bold

Boolean indicating whether bold HTML elements should be preserved as
HTML in the wiki output rather than being converted into MediaWiki
markup.

By default, E<lt>bE<gt> and E<lt>strongE<gt> elements are converted to
wiki markup identically. But sometimes you may wish E<lt>bE<gt> tags
in the HTML to be preserved in the resulting MediaWiki markup. This
attribute allows this.

For example, if C<preserve_bold> is enabled, HTML like

  <ul>
    <li> <b>Bold</b>
    <li> <strong>Strong</strong>
  </ul>

will be converted to

  * <b>Bold</b>
  * '''Strong'''

When disabled (the default), the preceding HTML markup would be
converted into

  * '''Bold'''
  * '''Strong'''

=head2 preserve_italic

Boolean indicating whether italic HTML elements should be preserved as
HTML in the wiki output rather than being converted into MediaWiki
markup.

For example, if C<preserve_italic> is enabled, HTML like

  <ul>
    <li> <i>Italic</i>
    <li> <em>Emphasized</em>
  </ul>

will be converted to

  * <i>Italic</i>
  * ''Emphasized''

When disabled (the default), the preceding HTML markup would be
converted into

  * ''Italic''
  * ''Emphasized''

=head2 preserve_templates

Boolean indicating whether C<{{template}}> calls found in HTML should
be preserved in the wiki markup. If disabled (the default), templates
calls will be wrapped in C<E<lt>nowikiE<gt>> tags.

=head2 preserve_nowiki

Boolean indicating whether C<E<lt>nowikiE<gt>> tags found in HTML
should be preserved in the wiki markup. If disabled (the default),
nowiki tags will be replaced with their content.

=head2 pad_headings

Boolean indicating whether section headings should be padded with
spaces (eg, "== Section ==" instead of "==Section=="). Default is
false (ie, not to pad).

=cut

my @common_attrs = qw/ id class lang dir title style /;
my @block_attrs = ( @common_attrs, 'align' );
my @tablealign_attrs = qw/ align char charoff valign /;
my @tablecell_attrs = qw(
  abbr axis headers scope rowspan
  colspan nowrap width height bgcolor
);

# Fix for bug 14527
my $pre_prefix = '[jsmckaoqkjgbhazkfpwijhkixh]';

sub rules {
  my $self = shift;

  my %rules = (
    hr     => { replace => "\n----\n" },
    br     => { preserve => 1, empty => 1, attributes => [ qw/id class title style clear/ ] },
    p      => { block => 1, trim => 'both', line_format => 'single' },
    em     => { start => "''", end => "''", line_format => 'single' },
    strong => { start => "'''", end => "'''", line_format => 'single' },

    i      => { alias => 'em' },
    b      => { alias => 'strong' },

    pre    => { line_prefix => $pre_prefix, block => 1 },

    table   => { start => \&_table_start, end => "|}", block => 1, line_format => 'blocks' },
    tr      => { start => \&_tr_start },
    td      => { start => \&_td_start, end => "\n", trim => 'both', line_format => 'blocks' },
    th      => { start => \&_td_start, end => "\n", trim => 'both', line_format => 'single' },
    caption => { start => \&_caption_start, end => "\n", line_format => 'single' },

    img => { replace => \&_image },
    a   => { replace => \&_link },

    ul => { line_format => 'multi', block => 1 },
    ol => { alias => 'ul' },
    dl => { alias => 'ul' },

    li => { start => \&_li_start, trim => 'leading' },
    dt => { alias => 'li' },
    dd => { alias => 'li' },

    # Preserved elements, from MediaWiki's Sanitizer.php (http://tinyurl.com/dzj6o)
    div        => { preserve => 1, attributes => \@block_attrs },
    span       => { preserve => 1, attributes => \@block_attrs },
    blockquote => { preserve => 1, attributes => [ @common_attrs, qw/ cite / ] },
    del        => { preserve => 1, attributes => [ @common_attrs, qw/ cite datetime / ] },
    ins        => { preserve => 1, attributes => [ @common_attrs, qw/ cite datetime / ] },
    font       => { preserve => 1, attributes => [ @common_attrs, qw/ size color face / ] },

    # Headings (h1-h6)
    h1 => { start => \&_hr_start, end => \&_hr_end, block => 1, trim => 'both', line_format => 'single' },
    h2 => { start => \&_hr_start, end => \&_hr_end, block => 1, trim => 'both', line_format => 'single' },
    h3 => { start => \&_hr_start, end => \&_hr_end, block => 1, trim => 'both', line_format => 'single' },
    h4 => { start => \&_hr_start, end => \&_hr_end, block => 1, trim => 'both', line_format => 'single' },
    h5 => { start => \&_hr_start, end => \&_hr_end, block => 1, trim => 'both', line_format => 'single' },
    h6 => { start => \&_hr_start, end => \&_hr_end, block => 1, trim => 'both', line_format => 'single' },
  );

  my @preserved = qw/ center cite code var sup sub tt big small strike s u ruby rb rt rp /;
  push @preserved, 'i' if $self->preserve_italic;
  push @preserved, 'b' if $self->preserve_bold;
  push @preserved, 'nowiki' if $self->preserve_nowiki;
  $rules{$_} = { preserve => 1, attributes => \@common_attrs } foreach @preserved;

  return \%rules;
}

sub attributes { {
  preserve_italic => { default => 0 },
  preserve_bold => { default => 0 },
  strip_tags => { default => [ qw/ head style script ~comment title meta link object / ] },
  pad_headings => { default => 0 },
  preserve_templates => { default => 0 },
  preserve_nowiki => { default => 0 },

  # see bug #28402
# xxx  passthrough_naked_tags => { default => [ qw/ tbody thead font / ] },
  passthrough_naked_tags => { default => [ qw/ tbody thead font span / ] },
} }

sub _hr_start { 
  my( $wc, $node, $subrules ) = @_;
  ( my $level = $node->tag ) =~ s/\D//g;
  my $affix = ('=') x $level;
  return $wc->pad_headings ? "$affix " : $affix;
}

sub _hr_end {
  my( $wc, $node, $subrules ) = @_;
  ( my $level = $node->tag ) =~ s/\D//g;
  my $affix = ('=') x $level;
  return $wc->pad_headings ? " $affix" : $affix;
}

sub postprocess_output {
  my( $self, $outref ) = @_;
  $$outref =~ s/\Q$pre_prefix\E/ /g;
}

# Calculates the prefix that will be placed before each list item.
# Handles ordered, unordered, and definition list items.
sub _li_start {
  my( $self, $node, $rules ) = @_;
  my @parent_lists = $node->look_up( _tag => qr/ul|ol|dl/ );

  my $prefix = '';
  foreach my $parent ( @parent_lists ) {
    my $bullet = '';
    $bullet = '*' if $parent->tag eq 'ul';
    $bullet = '#' if $parent->tag eq 'ol';
    $bullet = ':' if $parent->tag eq 'dl';
    $bullet = ';' if $parent->tag eq 'dl' and $node->tag eq 'dt';
    $prefix = $bullet.$prefix;
  }

  return "\n$prefix ";
}

sub _link {
  my( $self, $node, $rules ) = @_;
  my $url = defined $node->attr('href') ? $node->attr('href') : '';
  my $text = $self->get_elem_contents($node);

  # Handle internal links
  if( my $title = $self->get_wiki_page( $url ) ) {
    $title =~ s/_/ /g;
    return "[[$title]]" if $text eq $title;        # no difference between link text and page title
    return "[[$text]]" if $text eq lcfirst $title; # differ by 1st char. capitalization
    return "[[$title|$text]]";                     # completely different
  }

  # Treat them as external links
  return $url if $url eq $text;
  return "[$url $text]";
}

sub _image {
  my( $self, $node, $rules ) = @_;
  return '' unless $node->attr('src');

  my $alt = $node->attr('alt') || '';
  my $img = basename( URI->new($node->attr('src'))->path );
  my $width = $node->attr('width') || '';

  return sprintf '[[Image:%s|%spx|%s]]', $img, $width, $alt if $alt and $width;
  return sprintf '[[Image:%s|%s]]', $img, $alt if $alt;
  return sprintf '[[Image:%s]]', $img;
}

sub _table_start {
  my( $self, $node, $rules ) = @_;
  my $prefix = '{|';

  my @table_attrs = (
    @common_attrs, 
    qw/ summary width border frame rules cellspacing
        cellpadding align bgcolor frame rules /
  );

  my $attrs = $self->get_attr_str( $node, @table_attrs );
  $prefix .= ' '.$attrs if $attrs;

  return $prefix."\n";
}

sub _tr_start {
  my( $self, $node, $rules ) = @_;
  my $prefix = '|-';
  
  my @tr_attrs = ( @common_attrs, 'bgcolor', @tablealign_attrs );
  my $attrs = $self->get_attr_str( $node, @tr_attrs );
  $prefix .= ' '.$attrs if $attrs;

  return '' unless $node->left or $attrs;
  return $prefix."\n";
}

# List of tags (and pseudo-tags, in the case of '~text') that are
# considered phrasal elements. Any table cells that contain only these
# elements will be placed on a single line.
my @td_phrasals = qw/ i em b strong u tt code span font sup sub br ~text s strike del ins /;
my %td_phrasals = map { $_ => 1 } @td_phrasals;

sub _td_start {
  my( $self, $node, $rules ) = @_;
  my $prefix = $node->tag eq 'th' ? '!' : '|';

  my @td_attrs = ( @common_attrs, @tablecell_attrs, @tablealign_attrs );
  my $attrs = $self->get_attr_str( $node, @td_attrs );
  $prefix .= ' '.$attrs.' |' if $attrs;

  # If there are any non-text elements inside the cell, then the
  # cell's content should start on its own line
  my @non_text = grep !$td_phrasals{$_->tag}, $node->content_list;
  my $space = @non_text ? "\n" : ' ';

  return $prefix.$space;
}

sub _caption_start {
  my( $self, $node, $rules ) = @_;
  my $prefix = '|+ ';

  my @caption_attrs = ( @common_attrs, 'align' );
  my $attrs = $self->get_attr_str( $node, @caption_attrs );
  $prefix .= $attrs.' |' if $attrs;

  return $prefix;
}

sub preprocess_node {
  my( $self, $node ) = @_;
  my $tag = defined $node->tag ? $node->tag : '';
  $self->strip_aname($node) if $tag eq 'a';
  $self->_strip_extra($node);
  $self->_nowiki_text($node) if $tag eq '~text';
  
#  # XXX font-to-span convers
#  $node->tag('span') if $tag eq 'font';
}

my $URL_PROTOCOLS = 'http|https|ftp|irc|gopher|news|mailto';
my $EXT_LINK_URL_CLASS = '[^]<>"\\x00-\\x20\\x7F]';
my $EXT_LINK_TEXT_CLASS = '[^\]\\x00-\\x1F\\x7F]';

# Text nodes matching one or more of these patterns will be enveloped
# in <nowiki> and </nowiki>

sub _wikitext_patterns {
  my $self = shift;

  # the caret in "qr/^/" seems redundant with "start_of_line" but both
  # are necessary
  my %wikitext_patterns = (
    misc     => { pattern => qr/^(?:\*|\#|\;|\:|\=|\!|\|)/m, location => 'start_of_line' },
    italic   => { pattern => qr/''/,     location => 'anywhere' },
    rule     => { pattern => qr/^----/m, location => 'start_of_line' },
    table    => { pattern => qr/^\{\|/m, location => 'start_of_line' },
    link     => { pattern => qr/\[\[/m,  location => 'anywhere' },
    template => { pattern => qr/{{/m,    location => 'anywhere' },
  );

  delete $wikitext_patterns{template} if $self->preserve_templates;
  return \%wikitext_patterns;
}

sub _nowiki_text {
  my( $self, $node ) = @_;

  my $text = defined $node->attr('text') ? $node->attr('text') : '';
  return unless $text;

  my $wikitext_patterns = $self->_wikitext_patterns;
  my $found_nowiki_text = 0;

  ANYWHERE: {
    my @anywhere_patterns =
      map { $_->{pattern} } grep { $_->{location} eq 'anywhere' } values %$wikitext_patterns;

    $found_nowiki_text++ if $self->_match( $text, \@anywhere_patterns );
  };

  START_OF_LINE: {
    last if $found_nowiki_text;

    my @sol_patterns =
      map { $_->{pattern} } grep { $_->{location} eq 'start_of_line' } values %$wikitext_patterns;

    # find closest parent that is a block-level node
    my $nearest_parent_block = $self->elem_search_lineage( $node, { block => 1 } );

    if( $nearest_parent_block ) {
      my $leftmostish_text_node = $self->_get_leftmostish_text_node( $nearest_parent_block );
      if( $leftmostish_text_node and $node == $leftmostish_text_node ) {
        # I'm the first child in this block element, so let's apply start_of_line nowiki fixes
        $found_nowiki_text++ if $self->_match( $text, \@sol_patterns );
      }
    }
  };

  if( $found_nowiki_text ) {
    $text = "<nowiki>$text</nowiki>";
  } else {
    $text =~ s~(\[\b(?:$URL_PROTOCOLS):$EXT_LINK_URL_CLASS+ *$EXT_LINK_TEXT_CLASS*?\])~<nowiki>$1</nowiki>~go;
  }

  $node->attr( text => $text );
}

sub _get_leftmostish_text_node {
  my( $self, $node ) = @_;
  return unless $node;
  return $node if $node->tag eq '~text';
  return $self->_get_leftmostish_text_node( ($node->content_list)[0] )
}

sub _match {
  my( $self, $text, $patterns ) = @_;
  $text =~ $_ && return 1 for @$patterns;
  return 0;
}

my %extra = (
 id => qr/catlinks/,
 class => qr/urlexpansion|printfooter|editsection/
);

# Delete <span class="urlexpansion">...</span> et al
sub _strip_extra {
  my( $self, $node ) = @_;
  my $tag = defined $node->tag ? $node->tag : '';

  foreach my $att_name ( keys %extra ) {
    my $att_value = defined $node->attr($att_name) ? $node->attr($att_name) : '';
    if( $att_value =~ $extra{$att_name} ) {
      $node->detach();
      $node->delete();
      return;
    }
  }
}

=head1 AUTHOR

David J. Iberri, C<< <diberri at cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests to
C<bug-html-wikiconverter-mediawiki at rt.cpan.org>, or through the web
interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-WikiConverter-MediaWiki>.
I will be notified, and then you'll automatically be notified of
progress on your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc HTML::WikiConverter::MediaWiki

You can also look for information at:

=over 4

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/HTML-WikiConverter-MediaWiki>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/HTML-WikiConverter-MediaWiki>

=item * RT: CPAN's request tracker

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-WikiConverter-MediaWiki>

=item * Search CPAN

L<http://search.cpan.org/dist/HTML-WikiConverter-MediaWiki>

=back

=head1 COPYRIGHT & LICENSE

Copyright 2006 David J. Iberri, all rights reserved.

This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut

1;