package Markdent::Parser::SpanParser; { $Markdent::Parser::SpanParser::VERSION = '0.22'; } use strict; use warnings; use namespace::autoclean; use re 'eval'; use List::AllUtils qw( uniq ); use Markdent::Event::AutoLink; use Markdent::Event::EndCode; use Markdent::Event::EndEmphasis; use Markdent::Event::EndHTMLTag; use Markdent::Event::EndLink; use Markdent::Event::EndStrong; use Markdent::Event::HTMLComment; use Markdent::Event::HTMLEntity; use Markdent::Event::HTMLTag; use Markdent::Event::Image; use Markdent::Event::LineBreak; use Markdent::Event::StartCode; use Markdent::Event::StartEmphasis; use Markdent::Event::StartHTMLTag; use Markdent::Event::StartLink; use Markdent::Event::StartStrong; use Markdent::Event::Text; use Markdent::Regexes qw( $HTMLComment ); use Markdent::Types qw( Str ArrayRef HashRef RegexpRef EventObject ); use Moose; use MooseX::SemiAffordanceAccessor; use MooseX::StrictConstructor; with 'Markdent::Role::SpanParser'; has __pending_events => ( traits => ['Array'], is => 'rw', isa => ArrayRef [EventObject], default => sub { [] }, init_arg => undef, handles => { _pending_events => 'elements', _add_pending_event => 'push', _clear_pending_events => 'clear', }, ); has _span_text_buffer => ( traits => ['String'], is => 'ro', isa => Str, default => q{}, init_arg => undef, handles => { _save_span_text => 'append', _has_span_text_buffer => 'length', _clear_span_text_buffer => 'clear', }, ); has _links_by_id => ( traits => ['Hash'], is => 'ro', isa => HashRef [ArrayRef], default => sub { {} }, init_arg => undef, handles => { _add_link_by_id => 'set', _get_link_by_id => 'get', }, ); has _emphasis_start_delimiter_re => ( is => 'ro', isa => RegexpRef, lazy => 1, builder => '_build_emphasis_start_delimiter_re', init_arg => undef, ); has _escape_re => ( is => 'ro', isa => RegexpRef, lazy => 1, builder => '_build_escape_re', init_arg => undef, ); has _line_break_re => ( is => 'ro', isa => RegexpRef, lazy => 1, builder => '_build_line_break_re', init_arg => undef, ); has _escapable_chars => ( is => 'ro', isa => ArrayRef [Str], lazy => 1, builder => '_build_escapable_chars', ); sub extract_link_ids { my $self = shift; my $text = shift; ${$text} =~ s/ ^ \p{SpaceSeparator}{0,3} \[ ([^]]+) \] : \p{SpaceSeparator}* \n? \p{SpaceSeparator}* (.+) \n / $self->_process_id_for_link( $1, $2 ); /egxm; } sub _process_id_for_link { my $self = shift; my $id = shift; my $id_text = shift; $id_text =~ s/\s+$//; my ( $uri, $title ) = $self->_parse_uri_and_title($id_text); $self->_add_link_by_id( $id => [ $uri, $title ] ); return q{}; } sub _parse_uri_and_title { my $self = shift; my $text = shift; $text =~ s/^\s+|\s+$//g; my ( $uri, $title ) = split /(?:\p{SpaceSeparator}|\t)+/, $text, 2; $uri = q{} unless defined $uri; $uri =~ s/^<|>$//g; $title =~ s/^"|"$//g if defined $title; return ( $uri, $title ); } sub parse_block { my $self = shift; my $text = shift; $self->_print_debug("Parsing text for span-level markup\n\n$text\n") if $self->debug(); # Note that we have to pass a _reference_ to text in order to make sure # that we are matching the same variable with /g regexes each time. $self->_parse_text( \$text ); # This catches any bad start events that were found after the last end # event, or if there were _no_ end events at all. $self->_convert_invalid_start_events_to_text('is done'); $self->_debug_pending_events('before text merging'); $self->_merge_consecutive_text_events(); $self->_debug_pending_events('after text merging'); $self->handler()->handle_event($_) for $self->_pending_events(); $self->_clear_pending_events(); return; } sub _parse_text { my $self = shift; my $text = shift; PARSE: while (1) { if ( $self->debug() && pos ${$text} ) { $self->_print_debug( "Remaining text:\n[\n" . substr( ${$text}, pos ${$text} ) . "\n]\n" ); } if ( ${$text} =~ /\G\z/gc ) { $self->_event_for_text_buffer(); last; } my @look_for = $self->_possible_span_matches(); $self->_debug_look_for(@look_for); for my $span (@look_for) { my ( $markup, @args ) = ref $span ? @{$span} : $span; my $meth = '_match_' . $markup; $self->$meth( $text, @args ) and next PARSE; } $self->_match_plain_text($text); } } sub _possible_span_matches { my $self = shift; if ( my $event = $self->_open_start_event_for_span('code') ) { return [ 'code_end', $event->delimiter() ]; } my @look_for = 'escape'; push @look_for, $self->_look_for_strong_and_emphasis(); push @look_for, 'code_start'; unless ( $self->_open_start_event_for_span('link') ) { push @look_for, qw( auto_link link image ); } push @look_for, 'html_comment', 'html_tag', 'html_entity', 'line_break'; return @look_for; } sub _look_for_strong_and_emphasis { my $self = shift; my %start; $start{strong} = $self->_open_start_event_for_span('strong'); $start{emphasis} = $self->_open_start_event_for_span('emphasis'); # If we are in both, we need to try to end the most recent one first. if ( $start{strong} && $start{emphasis} ) { my $last_saw; for my $event ( $self->_pending_events() ) { if ( $event->event_name() eq 'start_strong' ) { $last_saw = 'strong'; } elsif ( $event->event_name() eq 'start_emphasis' ) { $last_saw = 'emphasis'; } } my @order = $last_saw eq 'strong' ? qw( strong emphasis ) : qw( emphasis strong ); return map { [ $_ . '_end', $start{$_}->delimiter() ] } @order; } elsif ( $start{emphasis} ) { return ( 'strong_start', [ 'emphasis_end', $start{emphasis}->delimiter() ] ); } elsif ( $start{strong} ) { return ( [ 'strong_end', $start{strong}->delimiter() ], 'emphasis_start' ); } # We look for strong first since it's a longer version of emphasis (we # need to try to match ** before *). return ( 'strong_start', 'emphasis_start' ); } sub _open_start_event_for_span { my $self = shift; my $type = shift; my $in; for my $event ( $self->_pending_events() ) { $in = $event if $event->event_name eq 'start_' . $type; undef $in if $event->event_name eq 'end_' . $type; } return $in; } sub _build_emphasis_start_delimiter_re { my $self = shift; return qr/(?:\*|_)/; } sub _build_escapable_chars { return [ qw( \ ` * _ { } [ ] ( ) + - . ! < > ), '#' ]; } sub _build_escape_re { my $self = shift; my $chars = join q{}, uniq( @{ $self->_escapable_chars() } ); return qr/\\([\Q$chars\E])/; } sub _build_line_break_re { my $self = shift; return qr/\p{SpaceSeparator}{2}\n/; } sub _match_escape { my $self = shift; my $text = shift; my $escape_re = $self->_escape_re(); return unless ${$text} =~ / \G ($escape_re) /xgc; $self->_print_debug("Interpreting as escaped character\n\n[$1]\n") if $self->debug(); $self->_save_span_text($2); return 1; } sub _match_strong_start { my $self = shift; my $text = shift; my ($delim) = $self->_match_delimiter_start( $text, qr/(?:\*\*|__)/ ) or return; my $event = $self->_make_event( StartStrong => delimiter => $delim ); $self->_markup_event($event); return 1; } sub _match_strong_end { my $self = shift; my $text = shift; my $delim = shift; $self->_match_delimiter_end( $text, qr/\Q$delim\E/ ) or return; my $event = $self->_make_event( EndStrong => delimiter => $delim ); $self->_markup_event($event); return 1; } sub _match_emphasis_start { my $self = shift; my $text = shift; my ($delim) = $self->_match_delimiter_start( $text, $self->_emphasis_start_delimiter_re(), ) or return; my $event = $self->_make_event( StartEmphasis => delimiter => $delim ); $self->_markup_event($event); return 1; } sub _match_emphasis_end { my $self = shift; my $text = shift; my $delim = shift; $self->_match_delimiter_end( $text, $self->_emphasis_end_delimiter_re($delim), ) or return; my $event = $self->_make_event( EndEmphasis => delimiter => $delim ); $self->_markup_event($event); return 1; } sub _emphasis_end_delimiter_re { my $self = shift; my $delim = shift; return qr/\Q$delim\E/; } sub _match_code_start { my $self = shift; my $text = shift; my ($delim) = $self->_match_delimiter_start( $text, qr/\`+\p{SpaceSeparator}*/ ) or return; $delim =~ s/\p{SpaceSeparator}*$//; my $event = $self->_make_event( StartCode => delimiter => $delim ); $self->_markup_event($event); return 1; } sub _match_code_end { my $self = shift; my $text = shift; my $delim = shift; $self->_match_delimiter_end( $text, qr/\p{SpaceSeparator}*\Q$delim/ ) or return; my $event = $self->_make_event( EndCode => delimiter => $delim ); $self->_markup_event($event); return 1; } sub _match_delimiter_start { my $self = shift; my $text = shift; my $delim = shift; return unless ${$text} =~ / \G ($delim)/xgc; return $1; } sub _match_delimiter_end { my $self = shift; my $text = shift; my $delim = shift; return unless ${$text} =~ /\G $delim /xgc; return 1; } sub _match_auto_link { my $self = shift; my $text = shift; return unless ${$text} =~ /\G <( (?:https?|mailto|ftp): [^>]+ ) >/xgc; my $link = $self->_make_event( AutoLink => uri => $1 ); $self->_markup_event($link); return 1; } # Stolen from Text::Markdown my $nested_brackets; $nested_brackets = qr{ (?> # Atomic matching [^\[\]]+ # Anything other than brackets | \[ (??{ $nested_brackets }) # Recursive set of nested brackets \] )* }x; # Also stolen from Text::Markdown my $nested_parens; $nested_parens = qr{ (?> # Atomic matching [^()]+ # Anything other than parens | \( (??{ $nested_parens }) # Recursive set of nested parens \) )* }x; sub _match_link { my $self = shift; my $text = shift; my $pos = pos ${$text} || 0; # For some inexplicable reason, this regex needs to be recreated each time # the method is called or $nested_brackets && $nested_parens are # undef. Presumably this has something to do with using it in a # subroutine's lexical scope (resetting the stack on each invocation?) return unless ${$text} =~ / \G \[ ($nested_brackets) \] # link or alt text (?: \( ($nested_parens) \) | \s* \[ ( [^]]* ) \] # an id (can be empty) )? # with no id or explicit uri, use text as id /xgc; my ( $link_text, $attr ) = $self->_link_match_results( $1, $2, $3 ); unless ( defined $attr->{uri} ) { pos ${$text} = $pos if defined $pos; return; } my $start = $self->_make_event( StartLink => %{$attr} ); $self->_markup_event($start); $self->_parse_text( \$link_text ); my $end = $self->_make_event('EndLink'); $self->_markup_event($end); return 1; } sub _match_image { my $self = shift; my $text = shift; my $pos = pos ${$text} || 0; return unless ${$text} =~ / \G ! \[ ($nested_brackets) \] # link or alt text (?: \( ($nested_parens) \) | \s* \[ ( [^]]* ) \] # an id (can be empty) )? # with no id or explicit uri, use text as id /xgc; my ( $alt_text, $attr ) = $self->_link_match_results( $1, $2, $3 ); unless ( defined $attr->{uri} ) { pos ${$text} = $pos if defined $pos; return; } $attr->{alt_text} = $alt_text; my $image = $self->_make_event( Image => %{$attr} ); $self->_markup_event($image); return 1; } sub _link_match_results { my $self = shift; my $text = shift; my $uri_and_title = shift; my $id = shift; my %attr; if ( defined $uri_and_title ) { my ( $uri, $title ) = $self->_parse_uri_and_title($uri_and_title); $attr{uri} = $uri; $attr{title} = $title if defined $title; } else { unless ( defined $id && length $id ) { $id = $text; $attr{is_implicit_id} = 1; } $id =~ s/\s+/ /g; my $link = $self->_get_link_by_id($id) || []; $attr{uri} = $link->[0]; $attr{title} = $link->[1] if defined $link->[1]; $attr{id} = $id; } return ( $text, \%attr ); } sub _match_html_comment { my $self = shift; my $text = shift; return unless ${$text} =~ / \G $HTMLComment /xgcs; my $comment = $1; $self->_detab_text( \$comment ); my $event = $self->_make_event( HTMLComment => text => $comment ); $self->_markup_event($event); return 1; } my %InlineTags = map { $_ => 1 } qw( area base basefont br col frame hr img input link meta param ); sub _match_html_tag { my $self = shift; my $text = shift; return unless ${$text} =~ /\G (< [^>]+ >)/xgc; my $tag = $1; my $event; if ( $tag =~ m{^$} ) { $event = $self->_make_event( EndHTMLTag => tag => $1 ); } else { $tag =~ s/^<|>$//g; my ( $name, $attr ) = split /\s+/, $tag, 2; $attr =~ s{/\s*$}{} if defined $attr; my %attr; if ( defined $attr && $attr =~ /\S/ ) { for my $attr ( split /\s+/, $attr ) { if ( $attr =~ /=/ ) { my ( $name, $val ) = split /=/, $attr; $val =~ s/^([\"\'])(.+)\1$/$2/g; $attr{$name} = $val; } else { # A value-less attribute like in #