package Text::Tokenizer;

use strict;
use warnings;
use Carp;

require Exporter;
use AutoLoader;

our @ISA = qw(Exporter);

# Every public constant and function of the module, grouped under the ':all'
# tag so that callers can write:  use Text::Tokenizer ':all';
#
# The list is written exactly once; @EXPORT_OK and @EXPORT below are derived
# from it so the three can never drift apart.
our %EXPORT_TAGS = (
    'all' => [ qw(
        TOK_UNDEF TOK_TEXT TOK_DQUOTE TOK_SQUOTE TOK_IQUOTE TOK_SIQUOTE
        TOK_BLANK TOK_ERROR TOK_EOL TOK_COMMENT TOK_EOF
        TOK_BASH_COMMENT TOK_C_COMMENT TOK_CC_COMMENT
        NOERR UNCLOSED_DQUOTE UNCLOSED_SQUOTE UNCLOSED_IQUOTE NOCONTEXT
        UNCLOSED_C_COMMENT
        TOK_OPT_DEFAULT TOK_OPT_NONE TOK_OPT_NOUNESCAPE TOK_OPT_SIQUOTE
        TOK_OPT_UNESCAPE TOK_OPT_UNESCAPE_CHARS TOK_OPT_UNESCAPE_LINES
        TOK_OPT_PASSCOMMENT TOK_OPT_PASS_COMMENT TOK_OPT_UNESCAPE_NQ_LINES
        TOK_OPT_C_COMMENT TOK_OPT_CC_COMMENT TOK_OPT_NO_BASH_COMMENT
        TOK_OPT_NO_IQUOTE
        tokenizer_options
        tokenizer_new tokenizer_new_strbuf
        tokenizer_scan tokenizer_exists tokenizer_switch
        tokenizer_delete tokenizer_flush tokenizer_destroy
    ) ],
);

# Symbols that may be requested explicitly by name.
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );

# Symbols exported by default. Kept identical to ':all' for backward
# compatibility: existing callers rely on a plain "use Text::Tokenizer;"
# bringing in every constant and function.
our @EXPORT = ( @{ $EXPORT_TAGS{'all'} } );

our $VERSION = '0.4.2';

# AUTOLOAD - lazily define constant subs on first use.
#
# Called by perl whenever an undefined sub in this package is invoked. The
# unqualified sub name is handed to the XS function constant(), which returns
# an ($error, $value) pair. On success a sub returning that value is installed
# under the requested name, so later calls no longer reach AUTOLOAD, and the
# current call is re-dispatched to it.
#
# Croaks if constant() itself is not defined (otherwise the lookup below
# would recurse into AUTOLOAD), or with the error constant() reports for an
# unknown name.
sub AUTOLOAD {
    our $AUTOLOAD;

    # Strip the package qualifier: "Text::Tokenizer::TOK_EOF" -> "TOK_EOF".
    (my $constname = $AUTOLOAD) =~ s/.*:://;

    croak "&Text::Tokenizer::constant not defined"
        if $constname eq 'constant';

    my ($error, $val) = constant($constname);
    croak $error if $error;

    {
        # Installing a sub through a symbolic glob reference needs strict
        # refs switched off for this one statement.
        no strict 'refs';
        *$AUTOLOAD = sub { $val };
    }

    # Tail-call the freshly installed sub with the caller's original @_.
    goto &$AUTOLOAD;
}

require XSLoader;
XSLoader::load('Text::Tokenizer', $VERSION);

# Preloaded methods go here.

# Autoload methods go after =cut, and are processed by the autosplit program.

1;
__END__

=head1 NAME

Text::Tokenizer - Perl extension for tokenizing text (config) files

=head1 SYNOPSIS

  use Text::Tokenizer ':all';

  # open the file and add it to the tokenizer inputs
  open(F_CONFIG, "input.conf") || die("failed to open input.conf");
  $tok_id = tokenizer_new(F_CONFIG);
  tokenizer_options(TOK_OPT_NOUNESCAPE|TOK_OPT_PASSCOMMENT);

  while(1)
  {
      ($string, $tok_type, $line, $err, $errline) = tokenizer_scan();
      last if($tok_type == TOK_ERROR || $tok_type == TOK_EOF);

      if($tok_type == TOK_TEXT)       { }
      elsif($tok_type == TOK_BLANK)   { }
      elsif($tok_type == TOK_DQUOTE)  { $string = "\"$string\""; }
      elsif($tok_type == TOK_SQUOTE)  { $string = "\'$string\'"; }
      elsif($tok_type == TOK_SIQUOTE) { $string = "\`$string\'"; }
      elsif($tok_type == TOK_IQUOTE)  { $string = "\`$string\`"; }
      elsif($tok_type == TOK_EOL)     { $string = "\n"; }
      elsif($tok_type == TOK_COMMENT) { }
      elsif($tok_type == TOK_UNDEF)   { last; }
      else                            { last; };

      print $string;
  }
  tokenizer_delete($tok_id);

A much more complex example of using Text::Tokenizer can be found in
passwd_exp, a tool for password expiration notification
(http://freshmeat.net/projects/passwd_exp).

=head1 DESCRIPTION

B<Text::Tokenizer> is a lexical analyzer that can be used to split input
text from a file or a buffer into basic I<tokens>:

=over 4

=item * NORMAL TEXT

=item * DOUBLE QUOTED "TEXT"

=item * SINGLE QUOTED 'TEXT'

=item * INVERSE QUOTED `TEXT`

=item * SINGLE-INVERSE QUOTED `TEXT'

=item * WHITESPACE TEXT

=item * #COMMENTS

=item * END OF LINE

=item * END OF FILE

=back

=head1 EXPORT

All constants and functions documented below are exported by default.
The ':all' tag is also available if you prefer to import them explicitly.

=head1 CONSTANTS

=over 17

=head2 I<Token types>

Token types that the tokenizer returns.

=item B<TOK_UNDEF>

Undefined token (tokenizer error)

=item B<TOK_TEXT>

Normal_text

=item B<TOK_DQUOTE>

"Double quoted text"

=item B<TOK_SQUOTE>

'Single quoted text'

=item B<TOK_IQUOTE>

`Inverse quoted text`

=item B<TOK_SIQUOTE>

`Single-inverse quoted text'

=item B<TOK_BLANK>

Whitespace text

=item B<TOK_COMMENT>

#Comment

=item B<TOK_EOL>

End of Line

=item B<TOK_EOF>

End of File

=item B<TOK_ERROR>

Error condition (see I<Error codes> below)

=head2 I<Error codes>

Error codes that the tokenizer returns when an error happens.

=item B<NOERR>

No error

=item B<UNCLOSED_DQUOTE>

Unclosed double quote found

=item B<UNCLOSED_SQUOTE>

Unclosed single quote found

=item B<UNCLOSED_IQUOTE>

Unclosed inverse quote found

=item B<NOCONTEXT>

Failed to allocate tokenizer context (FATAL ERROR)

=head2 I<Options>

Options configurable for the tokenizer. They should be OR-ed together when
passed to tokenizer_options.

=item B<TOK_OPT_DEFAULT>

Default option set, equal to TOK_OPT_NOUNESCAPE

=item B<TOK_OPT_NONE>

Set no options. The tokenizer will use its default behaviour - it will not
unescape anything and it will not pass comments to you.

=item B<TOK_OPT_NOUNESCAPE>

Disable unescaping of characters & lines.

=item B<TOK_OPT_SIQUOTE>

Enable looking for the `single-inverse quote' combination.

=item B<TOK_OPT_UNESCAPE>

Unescape chars & lines.

=item B<TOK_OPT_UNESCAPE_CHARS>

Unescape chars (inside of quotes only)

=item B<TOK_OPT_UNESCAPE_LINES>

Unescape lines (inside of quotes only)

=item B<TOK_OPT_PASSCOMMENT>

Enable passing of comments to user routines.

=item B<TOK_OPT_UNESCAPE_NQ_LINES>

Unescape lines (outside of quotes). An escaped end of line will not
terminate value processing, so escaped multiline text will be returned as a
single-line string.

=back

=head1 METHODS

=over 4

=item B<$options = tokenizer_options(OPTIONS)>

Set tokenizer options.

=item B<$tok_id = tokenizer_new(FILE_HANDLE)>

Create a new tokenizer instance (context) from FILE_HANDLE. The instance is
identified by the returned B<$tok_id>.

=item B<$tok_id = tokenizer_new_strbuf(BUFFER, LENGTH)>

Create a new tokenizer instance from the string BUFFER, which is LENGTH
characters long. Returns its tokenizer instance id.

=item B<@tok = tokenizer_scan()>

Scan the current tokenizer instance and return the first token found.

  @tok = ($string, $type, $line, $error, $error_line)

=over 10

=over 10

=item $string - found token string

=item $type - its type

=item $line - current line

=item $error - equals error code if error occurs

=item $error_line - line number where error begins (unclosed quote position)

=back

=back

=item B<tokenizer_exists>

Test if a tokenizer instance exists.

=item B<tokenizer_switch>

Switch to another tokenizer instance (like when you perform an include
statement).

=item B<tokenizer_delete>

Delete a tokenizer instance. (You have to do it exactly on EOF to release
the connection to the file or buffer.)

=item B<tokenizer_flush>

Flush a tokenizer instance. This function discards the instance buffer's
contents, so the next time the scanner attempts to match a token from the
buffer, it will have to fill it.

=back

=head1 SEE ALSO

This tokenizer is based on code generated by B<flex> - fast lexical analyzer
generator (http://lex.sourceforge.net).

=head1 AUTHOR

Samuel Behan, E<lt>_samkob_(a)_gmail_._com_E<gt>

=head1 COPYRIGHT AND LICENSE

Copyright 2003-2006 by Samuel Behan

This library is free software; you can redistribute it and/or modify it
under the terms of the GNU GPL v2.

=cut