package Text::Filter::Cooked;
use strict;
our $VERSION = "0.02";
use base q{Text::Filter};
use Carp;
# later use Encode;
=head1 NAME
Text::Filter::Cooked - Cooked reader for input files
=head1 SYNOPSIS
use Text::Filter::Cooked;
my $f = Text::Filter::Cooked->new
(input => 'myfile.dat',
comment => "#",
join_lines => "\\");
while ( my $line = $f->readline ) {
printf("%3d\t%s\n", $f->lineno, $line);
}
=head1 DESCRIPTION
Text::Filter::Cooked is a generic input reader. It takes care of a
number of things that are commonly used when reading data and
configuration files.
=over 4
=item *
Excess whitespace (leading and trailing) may be removed automatically.
Also, multiple whitespace characters may be replaced by a single blank.
=item *
Empty lines may be ignored automatically.
=item *
Lines that end with a custom defined join symbol, ususally a
backslash, are joined with the next line.
=item *
Lines that start with a custom defined comment symbol are ignored.
=back
=for later
On top of this, if the input file starts with a Unicode BOM, the input
will be correctly decoded into Perl internal format. It is also
possible to change the encoding used in a single file as often as
desired. See L.
Text::Filter::Cooked is based on Text::Filter, see L.
=cut
################ Attribute Controls ################
my %_attributes =
( ignore_empty_lines => 1,
ignore_leading_whitespace => 1,
ignore_trailing_whitespace => 1,
compress_whitespace => 1,
# later input => \&_diamond,
# later input_encoding => undef,
input_postread => 'chomp',
output_prewrite => 'newline',
comment => undef,
join_lines => undef,
_lineno => undef,
_open => 0,
);
sub _standard_atts {
my $self = shift;
my %k;
@k{ $self->SUPER::_standard_atts, keys %_attributes } = (0);
return keys %k;
}
sub _attr_default {
my ($self, $attr) = @_;
return $_attributes{$attr} if exists $_attributes{$attr};
return $self->SUPER::_attr_default($attr);
}
################ Constructor ################
=head1 CONSTRUCTOR
The constructor is called new() and takes a hash with attributes as
its parameter.
The following attributes are recognized and used by the constructor,
all others are passed to the base class, Text::Filter.
=over 4
=item ignore_empty_lines
If true, empty lines encountered in the input are ignored.
=item ignore_leading_whitespace
If true, leading whitespace encountered in the input is ignored.
=item ignore_trailing_whitespace
If true, trailing whitespace encountered in the input is ignored.
=item compress_whitespace
If true, multiple adjacent whitespace are compressed to a single space.
=item join_lines
This must be set to a string. Input lines that end with this string
(not taking the final line ending into account) are joined with the
next line read from the input.
=item comment
This must be set to a string. Input lines that start with this string
are ignored.
=for later
(but see L).
=begin later item input_encoding
Assume the input file to have this encoding.
Setting input_encoding will enable automatic and transparant handling
of different file encodings, see L.
=back
=cut
# Inherited from base class.
################ Attributes ################
=head1 METHODS
All attributes have set and get methods, e.g., C and
C.
Other methods:
=over 4
=item readline
Read a single line of input. If line ignoring is in effect, the
operation will be repeated internally until there is data to return.
=item lineno
Returns the number of the last line that was read from the input.
=item is_eof
Returns true iff the last record from the input has been read.
=back
=cut
sub set_input {
my ($self, $input) = @_;
$input = sub { $self->_diamond } if $input eq \&_diamond;
$self->SUPER::set_input($input);
}
sub set_ignore_empty_lines {
$_[0]->{ignore_empty_lines} = $_[1];
return;
}
sub get_ignore_empty_lines {
return $_[0]->{ignore_empty_lines};
}
=begin later
sub set_input_encoding {
my ($self, $enc) = @_;
$self->{input_encoding} = $enc;
if ( my $fd = $self->get_filter_input_fd ) {
binmode($fd, ':raw');
}
# warn("Input encoding = $enc\n");
return;
}
sub get_input_encoding {
return $_[0]->{input_encoding};
}
=cut
sub set_ignore_trailing_whitespace {
$_[0]->{ignore_trailing_whitespace} = $_[1];
return;
}
sub get_ignore_trailing_whitespace {
return $_[0]->{ignore_trailing_whitespace};
}
sub _set_lineno {
if ( @_ == 1 ) {
$_[0]->{_lineno}++
}
else {
$_[0]->{_lineno} = $_[1];
}
return;
}
sub _get_lineno {
return $_[0]->{_lineno};
}
sub set_comment {
my ($self, $c) = @_;
# This check will probably fail with a custom regexp engine.
$c = qr/^\Q$c\E(.*)$/ unless !defined($c) || ref($c) eq 'Regexp';
$self->{comment} = $c;
return;
}
sub get_comment {
return $_[0]->{comment};
}
sub set_ignore_leading_whitespace {
$_[0]->{ignore_leading_whitespace} = $_[1];
return;
}
sub get_ignore_leading_whitespace {
return $_[0]->{ignore_leading_whitespace};
}
sub set_compress_whitespace {
$_[0]->{compress_whitespace} = $_[1];
return;
}
sub get_compress_whitespace {
return $_[0]->{compress_whitespace};
}
sub set_join_lines {
my ($self, $v) = @_;
# This check will probably fail with a custom regexp engine.
$v = qr/^(.*)\Q$v\E$/ unless !defined($v) || ref($v) eq 'Regexp';
$self->{join_lines} = $v;
return;
}
sub get_join_lines {
return $_[0]->{join_lines};
}
sub _set_eof {
$_[0]->{_eof} = 1;
return;
}
sub _is_eof {
return $_[0]->{_eof};
}
sub _set_open {
$_[0]->{_open} = 1;
return;
}
sub _is_open {
return $_[0]->{_open};
}
################ Methods ################
sub readline {
my $self = shift;
return if $self->_is_eof;
my $post = sub {
for ( shift ) {
# Whitespace ignore + compress.
s/^\s+// if $self->get_ignore_leading_whitespace;
s/\s+$// if $self->get_ignore_trailing_whitespace;
s/\s+/ /g if $self->get_compress_whitespace;
return $_;
}
};
my $line;
my $pre;
while ( defined ($line = $self->SUPER::readline) ) {
=begin later
my $ienc = $self->get_input_encoding;
if ( $ienc && ! defined $self->_get_lineno ) {
# Detecting BOM...
if ( substr($line, 0, 2) eq "\xff\xfe" ) {
# Found BOM (BE)
$line = substr($line, 2);
$self->set_input_encoding($ienc = "utf-16-be");
}
elsif ( substr($line, 0, 2) eq "\xfe\xff" ) {
# Found BOM (LE)
$line = substr($line, 2);
$self->set_input_encoding($ienc = "utf-16le");
}
}
=cut
$self->_set_lineno;
$self->{_start_line} = $self->_get_lineno unless defined $pre;
=begin later
if ( $ienc ) {
$line = decode($ienc, $line, 0);
}
=cut
# Feature: ignore_empty_lines.
next unless $self->get_ignore_empty_lines && $line =~ /\S/;
my $t = $self->get_comment;
if ( $t && $line =~ $t ) {
=begin later
$line = $1;
if ( $line =~ /^\s*
content-type \s*
: \s*
text \s* (?: \/ \s* plain \s* )?
; \s* charset \s* = \s*
([^\s;]+)
\s* $
/mix ) {
$self->set_input_encoding($1);
}
=cut
next;
}
$t = $self->get_join_lines;
if ( $t && $line =~ $t ) {
$pre ||= "";
$pre .= $1;
next;
}
return $post->(defined $pre ? "$pre$line" : $line);
}
$self->_set_eof;
=for later
$self->set_input_encoding($self->{input_encoding});
=cut
return $post->($pre) if defined $pre;
return;
}
sub lineno {
my $self = shift;
return $self->{_start_line};
}
sub _diamond {
my $self = shift;
while ( 1 ) {
unless ( $self->_is_open ) {
return unless @ARGV;
my $argv = shift(@ARGV);
$self->{_argf} = undef;
open($self->{_argf}, '< :raw', $argv)
or die("$argv: $!\n");
$self->_set_open(1);
}
my $result = $self->{_argf}->readline;
return $result if defined $result;
close($self->{_argf});
$self->_set_open(0);
}
}
1;
__END__
=begin later head1 INPUT ENCODING
Text::Filter::Cooked is capable of dealing with input files that may
have arbitrary character encodings.
If the C attribute is set, the input data is assumed
to be in the specified encoding.
If the input file starts with a Unicode BOM marker, it will be
considered UTF-16 and decoded accordingly.
If the file contains a comment record with non-comment contents of the
form
Content-Type: text ; charset = FOO
the rest of the file is considered to be in encoding FOO.
All spaces in the Content-Type line are optional. Matching is
case-insensitive. C may be used instead of C.
Input encoding is reset to its original value after reading the last
line of a file. When reading multiple files using the default input
mechanism each file starts with the original setting of input
encoding.
=end later
=head1 EXAMPLE
This filters the input according to the specified parameters.
use Text::Filter::Cooked;
Text::Filter::Cooked->run
(input => 'myfile.dat',
comment => "#",
join_lines => "\\");
This filters the input and writes all cooked lines together with their
line numbers.
use Text::Filter::Cooked;
my $f = Text::Filter::Cooked->new
(input => 'myfile.dat',
comment => "#",
join_lines => "\\");
while ( my $line = $f->readline ) {
printf("%3d\t%s\n", $f->lineno, $line);
}
=begin later head1 EXAMPLE
use Text::Filter::Cooked;
my $f = Text::Filter::Cooked->new
(input => 'myfile.dat',
input_encoding => 'ascii',
comment => "#",
join_lines => "\\");
while ( my $line = $f->readline ) {
printf("%3d\t%s\n", $f->lineno, $line);
}
Example data file:
# This is comment, and ignored.
This is data in ASCII
This is data in ASCII
This \
will \
be glued \
together \
as one line
# Content-Type text/plain; charset=iso-8859-1
Thïs ïs dätä ïn ISØ-8859.1 (Låtin1)
Thïs ïs dätä ïn ISØ-8859.1 (Låtin1)
=end later
=head1 AUTHOR AND CREDITS
Johan Vromans (jvromans@squirrel.nl) wrote this module.
=head1 COPYRIGHT AND DISCLAIMER
This program is Copyright 2007 by Squirrel Consultancy. All
rights reserved.
This program is free software; you can redistribute it and/or modify
it under the terms of either: a) the GNU General Public License as
published by the Free Software Foundation; either version 1, or (at
your option) any later version, or b) the "Artistic License" which
comes with Perl.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See either the
GNU General Public License or the Artistic License for more details.
=cut