#!/usr/bin/perl
package main;
use warnings;
use strict;

use HTML::WikiConverter;

# Main program: build a converter from the command-line options, slurp the
# HTML from the file(s) named in @ARGV (or STDIN when none remain), and
# print the resulting wiki markup followed by a newline.
my %o    = H::WC::GetOpts->get_opts();
my $wc   = HTML::WikiConverter->new(%o);
my $html = do { local $/; <> };
print $wc->html2wiki($html), "\n";

#
# Attribute/option handling
#

package H::WC::GetOpts;

use Params::Validate ':types';
use Getopt::Long;
use Pod::Usage;

# get_opts()
#
# Parses @ARGV and returns a flat hash (attribute name => value) suitable
# for passing to the HTML::WikiConverter constructor. Only options that
# were actually given are returned, and their names are converted from
# dashed option form back to underscored attribute form.
#
# --list, --help and --options print their output and exit with status 1.
# If no dialect is given via --dialect or the WCDIALECT environment
# variable, usage is printed via pod2usage(2).
sub get_opts {
    my %attrs = known_attributes();

    # Pre-seed every known option with undef; Getopt::Long stores parsed
    # values into this hash because it is passed as the first argument.
    my %o = map { $_ => undef } keys %attrs;

    my @optspec = (
        \%o,
        'list' => sub {
            warn "Installed dialects:\n";
            print map "$_\n", HTML::WikiConverter->available_dialects;
            exit(1);
        },
        'help'    => sub { pod2usage( -exit => 1, -verbose => 0 ) },
        'options' => sub {
            warn "Accepted options:\n";
            print map "--$_\n", sort keys %attrs;
            exit(1);
        },
        attrs2optspecs( \%attrs ),
    );

    eval { GetOptions(@optspec) or pod2usage(2) };
    die "problem parsing command-line options: $@" if $@;

    $o{dialect} ||= $ENV{WCDIALECT};
    pod2usage(2) unless $o{dialect};

    remove_ignored_opts( \%o );
    return %o;
}

# known_attributes()
#
# Returns a hash of option name => attribute spec. The specs are the
# converter's default attribute specs merged with the attributes of every
# installed dialect module that can be loaded; dialects that fail to load
# are skipped silently. Names listed in IGNORED_ATTRS are removed, and
# underscores in the remaining names are replaced with dashes.
sub known_attributes {
    my %attributes = %{ HTML::WikiConverter->__default_attribute_specs };

    my @modules = map { "HTML::WikiConverter::$_" }
        HTML::WikiConverter->available_dialects;

    foreach my $module (@modules) {

        # String eval is required here because the module name is dynamic.
        next unless eval "use $module; 1";

        # Later dialects override earlier definitions of the same attribute.
        my %attrs = %{ $module->attributes };
        @attributes{ keys %attrs } = values %attrs;
    }

    delete $attributes{$_} for IGNORED_ATTRS();

    # Normalize attr name with dashes. keys() returns a snapshot list, so
    # renaming entries inside the loop is safe.
    foreach my $attr ( keys %attributes ) {
        my $new_attr = $attr;
        next unless $new_attr =~ tr/_/-/;
        $attributes{$new_attr} = delete $attributes{$attr};
    }

    return %attributes;
}

# attrs2optspecs(\%attrs)
#
# Converts a hash of option name => Params::Validate-style spec into a
# sorted list of Getopt::Long option specifications. Attributes without a
# 'type' are treated as BOOLEAN; CODEREF attributes and attributes with a
# false name are skipped. Names present in FORCED_OPTSPECS use the spec
# given there verbatim.
sub attrs2optspecs {
    my $attrs = shift;

    my @specs;
    my %forced_optspecs = FORCED_OPTSPECS();

    while ( my ( $attr, $spec ) = each %$attrs ) {
        my $type = $spec->{type} || BOOLEAN;
        next if $type == CODEREF;
        next unless $attr;

        if ( my $fspec = $forced_optspecs{$attr} ) {
            push @specs, $fspec;
            next;
        }

        # '=' makes the option's value mandatory, ':' makes it optional.
        my $reqopt_sym =
            exists $spec->{optional} && !$spec->{optional} ? '=' : ':';

        # 's' takes a string value; 's@' collects repeated string values.
        my $type_sym = '';
        $type_sym = 's' if $type == SCALAR;
        $type_sym = 's@'
            if $type == ARRAYREF
            or $type == ( SCALAR | ARRAYREF );

        # Booleans that default to on get a negatable (--no-foo) form.
        my $neg_sym = '';
        $neg_sym = '!'
            if $type == BOOLEAN
            and $spec->{default}
            and $spec->{default} eq '1';

        # No required/optional symbol without a type to follow it
        $reqopt_sym = '' unless $type_sym;

        push @specs, join '', $attr, $reqopt_sym, $type_sym, $neg_sym;
    }

    return sort @specs;
}

# remove_ignored_opts(\%o)
#
# Cleans the parsed-options hash in place: drops keys listed in
# IGNORED_ATTRS, drops options that were never set (undef values), and
# renames the remaining dashed option names to underscored attribute names.
sub remove_ignored_opts {
    my $o = shift;

    my %ignored_attrs = map { $_ => 1 } IGNORED_ATTRS();

    foreach my $key ( keys %$o ) {
        if ( $ignored_attrs{$key} or !defined $o->{$key} ) {
            delete $o->{$key};
            next;
        }

        my $new_key = $key;
        next unless $new_key =~ tr/-/_/;
        $o->{$new_key} = delete $o->{$key};
    }
}

# Names that are never passed on as converter attributes.
sub IGNORED_ATTRS { qw/ list help options slurp / }

# For forward compatibility until 'type' is specified for all dialect attributes
sub FORCED_OPTSPECS {
    (
        'base-uri'             => 'base-uri:s',
        'header-style'         => 'header-style:s',
        'image-style'          => 'image-style:s',
        'link-style'           => 'link-style:s',
        'ordered-list-style'   => 'ordered-list-style:s',
        'strip-tags'           => 'strip-tags:s@',
        'unordered-list-style' => 'unordered-list-style:s',
        'wiki-uri'             => 'wiki-uri:s@',
    )
}

__END__

=head1 NAME

html2wiki - convert HTML into wiki markup

=head1 SYNOPSIS

html2wiki [options] [file]

Commonly used options:

    --dialect=dialect    Dialect name, e.g. "MediaWiki" (required unless
                         the WCDIALECT environment variable is used)
    --encoding=encoding  Source encoding (default is 'utf-8')
    --base-uri=uri       Base URI for relative links
    --wiki-uri=uri       URI fragment for wiki links
    --wrap-in-html       Wrap input in <html> and </html> (enabled by
                         default). Use --no-wrap-in-html to disable.
    --escape-entities    Escape HTML entities within text elements
                         (enabled by default). Use --no-escape-entities
                         to disable.

    --list               List installed dialects and exit
    --options            List all recognized options (except for
                         negations such as --no-wrap-in-html)
    --help               Show this message and exit

    Additional options, including those corresponding to dialect
    attributes, are also supported. Consult the html2wiki man page for
    details.

Example:

    html2wiki --dialect MediaWiki --encoding iso-8859-1 \
        --base-uri http://en.wikipedia.org/wiki/ \
        --wiki-uri http://en.wikipedia.org/wiki/ \
        input.html > output.wiki

=head1 DESCRIPTION

C<html2wiki> is a command-line interface to L<HTML::WikiConverter>,
which it uses to convert HTML to wiki markup.

=head1 DIALECTS

If the dialect you provide in C<--dialect> is not installed on your
system (e.g. if you specify C<MediaWiki> but have not installed its
dialect module, L<HTML::WikiConverter::MediaWiki>) a fatal error will be
issued. Use C<html2wiki --list> to list all available dialects on your
system. Additional dialects may be downloaded from the CPAN.

=head1 OPTIONS

=head2 Correspondence of options and attributes

Each of the options accepted by C<html2wiki> corresponds to an
HTML::WikiConverter attribute. Commonly used options described in
C<html2wiki --help> therefore correspond to attributes discussed in
L<HTML::WikiConverter/ATTRIBUTES>. That section also contains other
attributes that may be used as C<html2wiki> command-line options.

=head2 Mapping an attribute name to an option name

While related, option names are not identical to their corresponding
attribute names. The only difference is that attribute names use
underscores to separate words while option names use hyphens. For
example, the C<base_uri> attribute corresponds to the C<--base-uri>
command-line option.

=head2 Additional options defined in dialect modules

Individual dialects may define their own attributes, and therefore make
available their own command-line options to C<html2wiki>, in addition to
the ones defined by C<HTML::WikiConverter>. The same rules described
above apply for converting between these attribute names and their
corresponding command-line option names.
For example, Markdown supports an C<unordered_list_style> attribute that
takes a string value. To use this attribute on the command line, one
would use the C<--unordered-list-style> option. Consult individual
dialect man pages for a list of supported attributes.

=head2 Options that are enabled by default

Attributes that take boolean values may be enabled by default. The
C<wrap_in_html> attribute is one such example. Because of this,
C<html2wiki> will effectively behave by default as if C<--wrap-in-html>
had been specified in every invocation. If this is not desired, the
option name may be prefixed with C<no-> to disable the option, as in
C<--no-wrap-in-html>.

=head2 Options that take multiple values

Some attributes (e.g., C<strip_tags> and C<wiki_uri>) accept an array of
values. To accommodate this in C<html2wiki>, such options can be
specified more than once on the command line. For example, to specify
that only comment and script elements should be stripped from HTML:

    % html2wiki --strip-tags ~comment --strip-tags script ...

=head1 INPUT/OUTPUT

Input is taken from STDIN, so you may pipe the output from another
program into C<html2wiki>. For example:

    curl http://example.com/input.html | html2wiki --dialect MediaWiki

You may also specify a file to read HTML from:

    html2wiki --dialect MediaWiki input.html

Output is sent to STDOUT, though you may redirect it on the command
line:

    html2wiki --dialect MediaWiki input.html > output.wiki

Or you may pipe it into another program:

    html2wiki --dialect MediaWiki input.html | less

=head1 AUTHOR

David J. Iberri, C<< >>

=head1 COPYRIGHT & LICENSE

Copyright 2006 David J. Iberri, all rights reserved.

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=head1 SEE ALSO

L<HTML::WikiConverter>

=cut