package Data::FeatureFactory; use strict; use Carp; use File::Basename; use Scalar::Util; our $VERSION = '0.0404'; my $PATH = &{ sub { return dirname( (caller)[1] ) } }; my $OPEN_OPTIONS; our $CURRENT_FEATURE; my %KNOWN_FORMATS = map {;$_=>1} qw/binary normal numeric/; # check if perl can open files in utf8 { my $fh; undef $@; eval { open $fh, '<:encoding(utf8)', $0 }; if ($@) { $OPEN_OPTIONS = ''; warn qq{the open's :encoding directive not supported by your perl ($]). Files won't be opened in utf8 format.}; } else { $OPEN_OPTIONS = ':encoding(utf8)' } close $fh; } sub new : method { my ($class, $args) = @_; $class = ref $class if ref $class; croak "Too many parameters to $class->new" if @_ > 2; my $self = bless +{}, $class; if (defined $args) { croak "The parameter to ${class}->new must be a hashref with options or nothing" if ref $args ne 'HASH'; my %accepted_option = map {;$_=>1} qw(N/A); while (my ($k, $v) = each %$args) { if (not exists $accepted_option{$k}) { croak "Unexpected option '$k' passed to ${class}->new" } if ($k eq 'N/A') { $self->{'N/A'} = "$v"; } } } no strict 'refs'; if (not defined @{$class."::features"}) { croak "\@${class}::features not defined"; } our @features; *features = \@{$class."::features"}; use strict; if (not @features) { warn "$class has empty set of features. Not much fun"; } $self->{'features'} = []; my %feat_named; $self->{'feat_named'} = \%feat_named; my @featkeys; $self->{'featkeys'} = \@featkeys; $self->{'caller_path'} = dirname( (caller)[1] ); my %supported_option = ( map {;$_=>1} qw(code default format label name postproc range type values values_file) ); my %accepted_option = ( map {;$_=>1} qw(cat2num cat2num_dyna num2cat num2cat_dyna num_values_fh values_ordered) ); # parse the @features array for my $original_feature (@features) { my $feature = { %$original_feature }; if (not exists $feature->{'name'}) { croak q{There was a feature without a name. Each record in the @features array must be a hashref with a 'name' field at least}; } my $name = $feature->{'name'}; if (exists $feat_named{$name}) { croak "Feature $name specified twice in \@${class}::features"; } push @{ $self->{'features'} }, $feature; $feat_named{$name} = $feature; push @featkeys, $name; # Check if there aren't illegal options for (keys %$feature) { if (not exists $supported_option{$_}) { if (exists $accepted_option{$_}) { warn "Option '$_' you specified for feature '$name' is not supported. Be sure you know what you are doing" } else { croak "Unrecognized option '$_' specified for feature '$name'"; } } } # Check if a postprocessing subroutine is declared # If it's a CODEref, we're OK. Else try to load it. if (exists $feature->{'postproc'} and ref $feature->{'postproc'} ne 'CODE') { my $postproc = $feature->{'postproc'}; no strict 'refs'; my $postprocsub = \&{$postproc}; undef $@; eval { $postprocsub->() }; if ($@ =~ /Undefined subroutine/) { my ($package_name) = $postproc =~ /^( (?: \w+:: )+ )/x; my $ppname; if (defined $package_name and length $package_name > 0) { $package_name =~ s/::$//; local @INC = (@INC, $self->{'caller_path'}); undef $@; eval "require $package_name"; if ($@) { warn "Failed loading module '$package_name'"; } $ppname = $postproc; } else { $ppname = $class.'::'.$postproc; } $postprocsub = \&{$ppname}; undef $@; eval { $postprocsub->() }; if ($@ =~ /^Undefined subroutine/) { croak "Couldn't load postprocessing function '$postproc' ($@)" } } elsif ($@ =~ /^Undefined subroutine/) { croak "Couldn't load postprocessing function '$postproc' ($@)" } $feature->{'postproc'} = $postprocsub; } # Check if values are specified and if they are a list of values. if (exists $feature->{'values'}) { if (exists $feature->{'values_file'}) { croak "Values specified both explicitly and by file for '$name'" } my $values = $feature->{'values'}; if (ref $values eq 'HASH') { # OK, do nothing } elsif (ref $values eq 'ARRAY') { # Convert the list to a hash. my %values = map {;$_ => 1} @$values; $feature->{'values_ordered'} = $values; $feature->{'values'} = \%values; } else { my $type; if (ref $values) { $type = lc(ref $values).'ref'; } else { $type = lc(ref \$values); } croak "The values must be specified as an arrayref or hashref, not $type" } } if (exists $feature->{'values_file'}) { my $values_fn = $feature->{'values_file'}; my $opened = open my $values_fh, '<'.$OPEN_OPTIONS, $values_fn; if (not $opened) { open $values_fh, '<'.$OPEN_OPTIONS, $self->{'caller_path'}.'/'.$values_fn or croak "Couldn't open file '$values_fn' specifying values for $name" } my %values; my @values; while (<$values_fh>) { chomp; $values{$_} = 1; push @values, $_; } close $values_fh; $feature->{'values'} = \%values; $feature->{'values_ordered'} = \@values; } if (exists $feature->{'range'}) { if (exists $feature->{'values'}) { croak "Both range and values specified for feature '$name'" } $feature->{'range'} =~ /^ (.+?) \s* \.{2,} \s* (.+) $/x or croak "Malformed range '$$feature{range}' of feature '$name'. Should be in format '0 .. 5'"; my $l = $1+0; my $r = $2+0; if (not $l < $r) { croak "Invalid range '$$feature{range}' specified for feature '$name'. The left boundary must be lesser than the right one" } if ($feature->{'type'} =~ /^int/i) { $feature->{'values'} = {map {;$_ => 1} $l .. $r}; $feature->{'values_ordered'} = [$l .. $r]; } elsif ($feature->{'type'} =~ /^num/i) { $feature->{'range_l'} = $l; $feature->{'range_r'} = $r; } } if (exists $feature->{'default'}) { if (not exists $feature->{'values'} and not exists $feature->{'range_l'}) { croak "Default value '$$feature{default}' but no values specified for feature '$name'" } } if (exists $feature->{'type'}) { my $type = lc substr $feature->{'type'}, 0, 3; my $type_OK = grep {$type eq $_} qw(boo int num cat); if (not $type_OK) { croak "The type of a feature, if given, should be 'integer', 'numeric', or 'categorial'" } $feature->{'type'} = $type; # check if the values comply to the type if ($type eq 'boo') { if (exists $feature->{'values'}) { my @values = exists $feature->{'values_ordered'} ? @{ $feature->{'values_ordered'} } : values(%{ $feature->{'values'} }); if (@values > 2) { my $num_values = @values; croak "More than two values ($num_values) specified for feature '$name'" } my ($false, $true); # boolify the values for (@values) { if ($_) { if (defined $true) { croak "True value (literal: '$true', '$_') for feature '$name' specified multiple times" } $true = $_; } else { if (defined $false) { croak "False value (literal: '$false', '$_') for feature '$name' specified multiple times" } $false = $_; } $_ = $_ ? 1 : 0; } if (exists $feature->{'values_ordered'}) { $feature->{'values_ordered'} = \@values; } $feature->{'values'} = +{ map {;$_=>1} @values }; } else { $feature->{'values'} = {0 => 1, 1 => 1}; $feature->{'values_ordered'} = [0,1]; } if (exists $feature->{'default'}) { my $def = $feature->{'default'}; my @vals = values %{ $feature->{'values'} }; if (@vals > 1) { croak "Default value '$def' specified for boolean feature '$name' which has both values allowed" } unless ($def xor $vals[0]) { my $val = $def ? 'true' : 'false'; croak "Default and allowed value are both $val for feature '$name'"; } $feature->{'default'} = $def ? 1 : 0; } } elsif ($type eq 'int') { if (exists $feature->{'values'}) { my @values = exists $feature->{'values_ordered'} ? @{ $feature->{'values_ordered'} } : values(%{ $feature->{'values'} }); # integrify the values for (@values) { $_ = int $_; } if (exists $feature->{'values_ordered'}) { $feature->{'values_ordered'} = \@values; } $feature->{'values'} = +{ map {;$_=>1} @values }; } if (exists $feature->{'default'}) { $feature->{'default'} = int $feature->{'default'}; } } elsif ($type eq 'num') { # numify the features, producing warnings if used if (exists $feature->{'values'}) { my @values = exists $feature->{'values_ordered'} ? @{ $feature->{'values_ordered'} } : values(%{ $feature->{'values'} }); for (@values) { $_ += 0; } if (exists $feature->{'values_ordered'}) { $feature->{'values_ordered'} = \@values; } $feature->{'values'} = +{ map {;$_=>1} @values }; } if (exists $feature->{'default'}) { $feature->{'default'} += 0; } } } if (exists $feature->{'format'}) { my $format = $feature->{'format'}; if (not $format =~ /^ (?: normal | numeric | binary ) $/x) { croak "Invalid format '$format' specified for feature '$name'. Please specify 'normal', 'numeric' or 'binary'" } if (not exists $feature->{'values'} and $format eq 'binary') { croak "Feature '$name' has format: 'binary' specified but doesn't have values specified" } } # find the actual code of the feature my $code; no strict 'refs'; if (exists $feature->{'code'}) { $code = $feature->{'code'}; if (ref $code ne 'CODE') { croak "'code' was specified for feature '$name' but it's not a coderef" } } elsif (%{$class.'::features'} and exists ${$class.'::features'}{$name}) { $code = ${$class.'::features'}{$name}; if (ref $code ne 'CODE') { croak "Found $name in \%${class}::features but it's not a coderef" } } else { $code = *{$class.'::'.$name}{CODE}; if (ref $code ne 'CODE') { croak "Couldn't find the code (function) for feature '$name'. Define it as a function '$name' in the '$class' package. Stopped" } } $feature->{'code'} = $code; if (exists $feature->{'label'}) { my $label = $feature->{'label'}; if (ref $label eq 'ARRAY') { $feature->{'label'} = {map {;uc($_) => 1} @$label}; } elsif (ref $label) { croak "Label must be a string or an array of strings - feature '$name' has a ".ref($label).'ref' } else { $feature->{'label'} = {uc($label) => 1}; } } } # print map "*$_\n", map keys(%$_), @{ $self->{'features'} }; return $self } sub expand_names : method { my ($self, $featnames) = @_; if (not ref $featnames and exists $self->{expand_names_cache}{$featnames}) { return $self->{expand_names_cache}{$featnames} } my $orig_featnames = $featnames; my @featkeys = @{ $self->{'featkeys'} }; my %feat_named = %{ $self->{'feat_named'} }; if ($featnames eq 'ALL') { $featnames = \@featkeys; } elsif (ref $featnames eq 'ARRAY') { # $featnames = [@$featnames]; # make a copy } # features given by labels elsif ($featnames !~ /[[:lower:]]/ and $featnames =~ /[[:upper:]]/) { my @all_labels = split /\s+/, $featnames; my @plus_labels = map {s/^\+//; $_} grep {substr($_, 0, 1) ne '-'} @all_labels; my @minus_labels = map {substr $_, 1} grep {substr($_, 0, 1) eq '-'} @all_labels; # Specifying just '-LABEL' means all but those that have LABEL if (@plus_labels == 0 and @minus_labels > 0) { @plus_labels = qw(ALL); } if (grep {$_ eq 'ALL'} @minus_labels) { croak "Label 'ALL' is special and can't be used with the minus sign, as in $featnames" } $featnames = []; for my $featkey (@featkeys) { my $feature = $feat_named{ $featkey }; my $included = grep { $_ eq 'ALL' or exists $feature->{'label'}{$_} } @plus_labels; my $excluded = grep { exists $feature->{'label'}{$_} } @minus_labels; push @$featnames, $featkey if $included and not $excluded; } } else { $featnames = ["$featnames"]; } if (not ref $orig_featnames) { $self->{expand_names_cache}{$orig_featnames} = $featnames } return $featnames } sub evaluate : method { my ($self, $featnames, $format, @args) = @_; my $class = ref $self; $featnames = $self->expand_names($featnames); my @feats; if (exists $self->{evaluate_featnames_cache}{"@$featnames"}) { @feats = @{ $self->{evaluate_featnames_cache}{"@$featnames"} }; } else { my %feat_named = %{ $self->{feat_named} }; for my $featname (@$featnames) { if (not exists $feat_named{$featname}) { croak "Feature '$featname' you wish to evaluate was not found among known features (these are: @{$self->{featkeys}})" } push @feats, $feat_named{$featname}; } $self->{evaluate_featnames_cache}{"@$featnames"} = \@feats; } if (not exists $KNOWN_FORMATS{$format}) { croak "Unknown format: '$format'. Please specify one of: @{[keys %KNOWN_FORMATS]}." } for my $feature (@feats) { $self->_create_mapping($feature, $format); } if (@args == 0) { warn 'No arguments specified for the features.'; } ### Done argument checking. ### Traverse the features and evaluate them my @rv; for my $feature (@feats) { my $name = $feature->{'name'}; $CURRENT_FEATURE = $name; my $normrv = $feature->{'code'}(@args); undef $CURRENT_FEATURE; my $format = exists $feature->{'format'} ? $feature->{'format'} : $format; if (not defined $normrv and exists $self->{'N/A'}) { my $na = $self->{'N/A'}; if (exists $feature->{'type'} and $feature->{'type'} eq 'boo') { push @rv, $na; } elsif ($format eq 'binary') { # take one of the vectors in cat2bin my @dummy = @{ (values %{ $feature->{'cat2bin'} })[0] }; if (not @dummy) { croak "Couldn't determine the length of bit vector for feature '$name'," ."which was about to be evaluated in binary and returned undef" } push @rv, map $na, @dummy; } else { push @rv, $na; } } else { # Normally format the value. The eval babble is there to take care of unexpected values. undef $@; my @val = eval { _format_value($feature, $normrv, $format, @args) }; if ($@) { if (ref $@ and $@->isa('Data::FeatureFactory::SoftError')) { warn ${$@}; return } else { die $@ } } push @rv, @val; } } return @rv[0 .. $#rv] } sub _format_value { my ($feature, $normrv, $format, @args) = @_; my @rv; my $name = $feature->{'name'}; local $\; local $,; # convert to number if appropriate if (exists $feature->{'type'}) { my $type = $feature->{'type'}; if ($type eq 'num' or $type eq 'int') { $normrv += 0; } if ($type eq 'int') { $normrv = int $normrv; } if ($type eq 'boo') { $normrv = $normrv ? 1 : 0; } } # check if the value is a legal one if (exists $feature->{'values'}) { if (exists $feature->{'values'}{$normrv}) { # alles gute } elsif (exists $feature->{'default'}) { $normrv = $feature->{'default'}; } else { die Data::FeatureFactory::SoftError->new("Feature '$name' returned unexpected value '$normrv' on arguments '@args'") } } # check the range for numeric features elsif (exists $feature->{'range_l'}) { if (not exists $feature->{'range_r'}) { die "feature '$name' has range_l but not range_r"; } if ($normrv < $feature->{'range_l'}) { if (exists $feature->{'default'}) { $normrv = $feature->{'default'}; } else { die Data::FeatureFactory::SoftError->new( "Feature '$name' returned an unexpected value '$normrv' below the left allowed boundary '$$feature{range_l}'" ) } } if ($normrv > $feature->{'range_r'}) { if (exists $feature->{'default'}) { $normrv = $feature->{'default'}; } else { die Data::FeatureFactory::SoftError->new( "Feature '$name' returned an unexpected value '$normrv' above the right allowed boundary '$$feature{range_r}'" ) } } } if ($format eq 'normal') { if (exists $feature->{'postproc'}) { $normrv = $feature->{'postproc'}->($normrv); } @rv = ($normrv); } elsif ($format eq 'numeric') { if (exists $feature->{'type'} and $feature->{'type'} =~ /^( num | int | boo )$/x) { @rv = ($normrv); } elsif (exists $feature->{'cat2num'}) { if (not exists $feature->{'cat2num'}{$normrv}) { croak "Feature '$name' has the value '$normrv' for which there is no mapping to numbers" } @rv = ($feature->{'cat2num'}{$normrv}); } else { # dynamically creating the mapping my $n; if (exists $feature->{'cat2num_dyna'}{$normrv}) { $n = $feature->{'cat2num_dyna'}{$normrv}; } else { $n = ++$feature->{'num_value_max'}; $feature->{'cat2num_dyna'}{$normrv} = $n; $feature->{'num2cat_dyna'}{$n} = $normrv; my @toprint = ($normrv, $n); if (exists $feature->{'postproc'}) { my $ppd = $feature->{'postproc'}->($normrv); $feature->{'pp2cat_dyna'}{$ppd} = $normrv; push @toprint, $ppd; } print {$feature->{'num_values_fh'}} join("\t", @toprint)."\n" or croak "Couldn't print the mapping of categorial value '$normrv' to numeric value '$n' for feature '$name' to a file ($!).\n" . 'Please provide a list of values for the feature to avoid this' } @rv = ($n); } } elsif ($format eq 'binary') { if (exists $feature->{'type'} and $feature->{'type'} eq 'boo') { @rv = ($normrv); } elsif (not exists $feature->{'cat2bin'}{$normrv}) { croak "No mapping for value '$normrv' to binary in feature '$name'" } else { @rv = @{ $feature->{'cat2bin'}{$normrv} }; } } else { croak "Unrecognized format '$format'" } return @rv } sub _values_of { my ($feature) = @_; my @values; if (exists $feature->{'values_ordered'}) { @values = @{ $feature->{'values_ordered'} }; } elsif (exists $feature->{'values'}) { @values = keys %{ $feature->{'values'} }; } else { croak "Attempted to gather the values of feature '$$feature{name}', which has none specified" } if (exists $feature->{'default'} and not exists $feature->{'values'}{ $feature->{'default'} }) { push @values, $feature->{'default'}; } return @values } sub _create_mapping : method { my ($class, $feature, $format) = @_; $class = ref $class if ref $class; if (exists $feature->{'format'} and $format ne 'postprocd') { $format = $feature->{'format'}; } if (lc $format eq 'normal') { } elsif (lc $format eq 'numeric') { return if exists $feature->{'type'} and $feature->{'type'} eq 'num'; return if exists $feature->{'type'} and $feature->{'type'} eq 'int'; return if exists $feature->{'type'} and $feature->{'type'} eq 'boo'; return if exists $feature->{'cat2num'}; # Blindly trusting that what we have here is a sane mapping from the original values to numbers my $name = $feature->{'name'}; if (not exists $feature->{'values'}) { return if exists $feature->{'num_values_fh'}; warn "Categorial feature '$name' is about to be evaluated numerically but has no set of values specified"; (my $num_values_basename = $class.'__'.$name) =~ s/\W/_/g; $num_values_basename = '.FeatureFactory.'.$num_values_basename; my @filenames_to_try = ( $PATH.'/'.$num_values_basename, $ENV{'HOME'}.'/'.$num_values_basename, '/tmp/'.$num_values_basename, ); my $num_values_fh; my $opened; my $num_value_max = 0; FILENAME_R: for my $fn (@filenames_to_try) { $opened = open my $fh, '+<'.$OPEN_OPTIONS, $fn; if ($opened) { local $_; # for some reason, this is necessary to prevent crashes (Modification of read-only value) when e.g. in for(qw(a b)){ } while (<$fh>) { chomp; my ($cat, $num, $ppd) = split /\t/; $num_value_max = $num if $num > $num_value_max; $feature->{'cat2num_dyna'}{$cat} = $num; $feature->{'num2cat_dyna'}{$num} = $cat; $feature->{'pp2cat_dyna' }{$ppd} = $cat if defined $ppd; } print STDERR "Saving the mapping for feature '$name' to file $fn\n"; $feature->{'num_values_fh'} = $fh; last FILENAME_R } } # If there's no file to recover from, try to start a new one if (not $opened) { FILENAME_W: for my $fn (@filenames_to_try) { $opened = open my $fh, '>'.$OPEN_OPTIONS, $fn; if ($opened) { print STDERR "Saving the mapping for feature '$name' to file $fn\n"; $feature->{'num_values_fh'} = $fh; last FILENAME_W } }} if (not $opened) { delete $feature->{'num_values_fh'}; croak "Couldn't open a file for saving the mapping the categories of feature '$name' to numbers. " . 'Please specify the values for the feature to avoid this' } $feature->{'num_value_max'} = $num_value_max; } else { # Got values specified - create a mapping my @values = _values_of($feature); my $n = 1; for my $value (@values) { $feature->{'cat2num'}{$value} = $n; $feature->{'num2cat'}{$n} = $value; } continue { $n++; } } } elsif (lc $format eq 'binary') { return if exists $feature->{'type'} and $feature->{'type'} eq 'boo'; return if exists $feature->{'cat2bin'}; my $name = $feature->{'name'}; if (not exists $feature->{'values_ordered'} and not exists $feature->{'values'}) { croak "Attempted to convert feature '$name' to binary without specifying its values"; } my @values = _values_of($feature); my $n = 0; my @zeroes = (0) x scalar(@values); for my $value (@values) { my @vector = @zeroes; $vector[$n] = 1; $feature->{'cat2bin'}{$value} = \@vector; $feature->{'bin2cat'}{join(' ', @vector)} = $value; } continue { $n++; } } elsif ($format eq 'postprocd') { return if exists $feature->{'pp2cat'}; my $name = $feature->{'name'}; if (not exists $feature->{'postproc'}) { croak "Feature '$name' doesn't have a postprocessing function specified - can't create mapping from postprocessed values. Stopped" } my $ppfun = $feature->{'postproc'}; my @values = _values_of($feature); my %pp2cat; for my $value (@values) { my $ppd = $ppfun->($value); $pp2cat{ $ppd } = $value; } $feature->{'pp2cat'} = \%pp2cat; } else { croak "Format '$format' not recognized - please specify 'normal', 'numeric', 'binary' or 'postprocd' (should have caught this earlier)" } } sub names : method { my ($self) = @_; return map $_->{'name'}, @{ $self->{'features'} } } sub _vector_length { # how many bits will the binary representation of this feature have my ($feature) = @_; if (exists $feature->{'type'} and $feature->{'type'} eq 'boo') { return 1 } return scalar _values_of($feature) } sub _shift_value { my ($feature, $format, $values) = @_; if ($format ne 'binary') { return shift @$values } my $n = _vector_length($feature); if (@$values < $n) { croak "There's not enough fields left to shift a $format value (width $n) of feature '$$feature{name}' from a length " . scalar(@$values) . " list ('@$values')" } return splice @$values, 0, $n } sub _init_translation { my ($self, $names, $options) = @_; if (ref($names) ne 'ARRAY') { croak 'Names must be given by an arrayref' } if (ref($options) ne 'HASH') { croak 'Options must be given by a hashref' } my %accepted_options = map {;$_=>1} qw( names from_format to_format from_NA to_NA FS OFS header ignore ); for (keys %$options) { if (not exists $accepted_options{$_}) { croak "Translate does not accept option '$_'. Accepted options are: ".join(' ', keys %accepted_options).'. Stopped' } } my $from_format = $options->{'from_format'}; my $to_format = $options->{'to_format'}; for ($from_format, $to_format) { if (! m/^(?: normal | numeric | binary )$/x) { croak '{to,from}_format must be one of "normal", "numeric" or "binary"' } } if (exists $options->{'from_NA'} and exists $options->{'to_NA'}) { } elsif (exists $options->{'from_NA'} and exists $self->{'N/A'}) { $options->{'to_NA'} = $self->{'N/A'}; } elsif (exists $options->{'to_NA'} and exists $self->{'N/A'}) { $options->{'from_NA'} = $self->{'N/A'}; } elsif (exists $options->{'to_NA'}) { $options->{'from_NA'} = undef; } elsif (exists $options->{'from_NA'}) { croak 'from_NA specified but neither to_NA nor global N/A value specified' } elsif (exists $self->{'N/A'}) { $options->{'from_NA'} = $options->{'to_NA'} = $self->{'N/A'}; } if (exists $options->{'header'} and not $options->{'header'}) { delete $options->{'header'}; } if (exists $options->{'ignore'}) { my $ignore = $options->{'ignore'}; $options->{'ignore'} = []; if (not ref $ignore) { $ignore = [$ignore]; } if (ref($ignore) eq 'ARRAY') { my $has_non_nums = grep !Scalar::Util::looks_like_number($_), @$ignore; if ($has_non_nums) { warn 'Some of the specifications of columns to ignore are non-numeric' } for my $idx (@$ignore) { if ($idx < 0) { croak "Negative column indices aren't currently supported. Trailing columns are ignored always. Stopped" } $options->{'ignore'}[ $idx ] = 1; } } else { croak 'Option "ignore" can only be a column number or an array thereof. Stopped' } # Remove the names of the columns to ignore if the names come from a header if (exists $options->{'header'}) { for my $idx (sort {$b <=> $a} @$ignore) { splice @$names, $idx, 1; } } } my (@features, @widths); my %names = map {;$_=>1} $self->names; for my $name (@$names) { if (not exists $names{$name}) { croak "Feature '$name' not found among ".join(' ', $self->names).". Stopped" } my $feature = $self->{'feat_named'}{ $name }; $self->_create_mapping($feature, $from_format); $self->_create_mapping($feature, $to_format); if ($from_format eq 'normal' and exists $feature->{'postproc'}) { if (exists $feature->{'values'}) { $self->_create_mapping($feature, 'postprocd'); } elsif (exists $feature->{'format'} or $to_format eq 'normal') { # translating normal -> normal -- kein problem } elsif (join(' ', sort $from_format, $to_format) eq 'normal numeric') { # translating with dynamic mapping } else { croak "Feature '$name' is postprocessed and about to be translated from normal but has no values specified. Stopped" } } push @features, $feature; my $bin = 0; if (exists $feature->{'format'} and $feature->{'format'} eq 'binary') { $bin = 1; } elsif (exists $feature->{'format'}) {} elsif ($from_format eq 'binary') { $bin = 1; } my $width = $bin ? _vector_length($feature) : 1; push @widths, $width; } return map [$names->[$_], $features[$_], $widths[$_]], 0 .. $#features } my %x2cat = ( binary => 'bin2cat', numeric => 'num2cat', postprocd => 'pp2cat', ); sub _translate_row : method { my ($self, $descrs, $values, $options) = @_; if (ref($values) ne 'ARRAY') { croak 'Values must be given by an arrayref' } if (@$values < @$descrs) { croak "There's not enough values in the \@values array (".scalar(@$values).") to match the number of features (".scalar(@$descrs).")"; } my ($from_format, $to_format, $from_NA, $to_NA, $ignore) = @$options{qw( from_format to_format from_NA to_NA ignore)}; my $coln = 0; my @rv; FEATNAME: for my $descr (@$descrs) { my ($name, $feature, $width) = @$descr; if (defined $ignore) { while (exists $ignore->[ $coln++ ]) { push @rv, shift @$values; } } my $from_format = exists $feature->{'format'} ? $feature->{'format'} : $from_format; my $to_format = exists $feature->{'format'} ? $feature->{'format'} : $to_format; my @value = splice @$values, 0, $width; if (@value == 0) { croak "Zero-width value obtained for feature '$name'" } # Check if the value is N/A my $is_NA = 0; if (@value == 1) { my $value = $value[0]; if (defined $from_NA and $value eq $from_NA) { $is_NA = 1; } elsif (defined $to_NA and not defined $value and not defined $from_NA) { $is_NA = 1; } } else { if (defined $to_NA and not grep {defined $_} @value and not defined $from_NA) { $is_NA = 1; } elsif (defined $from_NA and not grep {$_ ne $from_NA} @value) { $is_NA = 1; } } # Append the N/A if appropriate if ($is_NA) { my $n = $to_format eq 'binary' ? _vector_length($feature) : 1; push @rv, ( ($to_NA) x $n ); next FEATNAME } if ($from_format eq $to_format) { push @rv, @value; next FEATNAME } else { my $catval; my $from_format = $from_format; if ($from_format eq 'normal' and exists $feature->{'postproc'}) { $from_format = 'postprocd'; } if ($from_format eq 'normal') { ($catval) = @value; } elsif ($from_format eq 'numeric' and exists $feature->{'type'} and $feature->{'type'} =~ /^(int|num|boo)$/) { ($catval) = @value; } elsif ($from_format eq 'binary' and exists $feature->{'type'} and $feature->{'type'} eq 'boo') { ($catval) = @value; } else { my $transfer = $x2cat{ $from_format }; if (not defined $transfer) { croak "Internal error: Unexpected value for \$from_format: '$from_format'" } if (not exists $feature->{ $transfer }) { if (exists $feature->{ $transfer.'_dyna' }) { $transfer = $transfer.'_dyna'; } else { croak "Cannot find mapping '$transfer' for feature '$name'" } } my $valval = join(' ', @value); if (not exists $feature->{ $transfer }{ $valval }) { my $hint = ''; if ($valval eq $feature->{'name'}) { $hint = ". Maybe you forgot there was a header in your file? Stopped" } croak "Unexpected value '$valval' of feature '$name' for transfer '$transfer'$hint" } $catval = $feature->{ $transfer }{ $valval }; } my @formatted = _format_value($feature, $catval, $to_format, 'NO_ARGS:TRANSLATING_ONLY'); push @rv, @formatted; } } # Append the trailing columns push @rv, @$values; return @rv } sub translate_row : method { my ($self, $names, $values, $options) = @_; $names = $self->expand_names($names); my @descrs = $self->_init_translation($names, $options); $self->_translate_row(\@descrs, $values, $options); } sub translate : method { my ($self, $source, $sink, $options) = @_; local $\; local $,; if (not defined Scalar::Util::openhandle($source)) { croak 'Source must be given by an open filehandle' } if (not defined Scalar::Util::openhandle($sink)) { croak 'Destination must be given by an open filehandle' } if (ref($options) ne 'HASH') { croak 'Options must be given by a hashref' } my $ifs = $options->{'FS'}; my $ofs = exists $options->{'OFS'} ? $options->{'OFS'} : $ifs; my @names; my @orig_header_fields; if (exists $options->{'names'}) { @names = @{ $self->expand_names($options->{'names'}) }; } elsif (exists $options->{'header'} and $options->{'header'}) { my $row = <$source>; chomp $row; @names = split /(?:\Q$ifs\E)+/, $row; @orig_header_fields = @names; } else { croak 'No feature names specified for translate' } my @descrs = $self->_init_translation(\@names, $options); # Translate the header, if there's one. if (@orig_header_fields) { my $globbin = $options->{'to_format'} eq 'binary'; my $last = pop @orig_header_fields; for my $field (@orig_header_fields) { my $nsep; if (not exists $self->{'feat_named'}{ $field }) { $nsep = 1; } else { my $feature = $self->{'feat_named'}{ $field }; my $bin = (exists $feature->{'format'} and $feature->{'format'} eq 'binary' or $globbin); $nsep = $bin ? _vector_length($feature) : 1; } print {$sink} $field, $ofs x $nsep; } print {$sink} $last, "\n"; } ROW: while (defined (my $row = <$source>)) { chomp $row; my @values = split /$ifs/, $row; undef $@; my @translated = eval { $self->_translate_row(\@descrs, \@values, $options) }; warn("$@ (line $.)"), next ROW if $@; print {$sink} join($ofs, @translated), "\n"; } } sub add_label { my ($feature, @labels) = @_; @labels = map uc($_), @labels; if (exists $feature->{'label'}) { if (ref($feature->{'label'}) eq 'ARRAY') { push @{ $feature->{'label'} }, @labels; } else { $feature->{'label'} = [$feature->{'label'}, @labels]; } } else { $feature->{'label'} = [@labels]; } } { package Data::FeatureFactory::SoftError; sub new { my ($class, $message) = @_; $message = "SoftError occurred" if not defined $message; return bless \$message, $class } } 1 __END__ =head1 NAME Data::FeatureFactory - evaluate features normally or numerically =head1 SYNOPSIS # in the module that defines features package MyFeatures; use base qw(Data::FeatureFactory); our @features = ( { name => 'no_of_letters', type => 'int', range => '0 .. 5' }, { name => 'first_letter', type => 'cat', 'values' => ['a' .. 'z'] }, ); sub no_of_letters { my ($word) = @_; return length $word } sub first_letter { my ($word) = @_; return substr $word, 0, 1 } # in the main script package main; use MyFeatures; my $f = MyFeatures->new; # evaluate all the features on all your data and format them numerically open FILEHANDLE, '>my_features.txt'; print FILEHANDLE join(' ', $f->names), "\n"; # prepend a header for my $record (@data) { my @values = $f->evaluate('ALL', 'numeric', $record); print FILEHANDLE join(' ', @values); } close FILEHANDLE; # specify the features to evaluate and gather the result in binary form my @vector = $f->evaluate([qw(no_of_letters first_letter)], 'binary', 'foo'); # translate the once evaluated features to other formats open SOURCE, 'my_features.txt'; open SINK, '>my_features.csv'; $f->translate(SOURCE, SINK, { from_format => 'numeric', to_format => 'normal', FS => ' ', OFS => ',', # fields from space-separated to comma-separated header => 1, # the names of the features are in the first row of SOURCE # names => 'ALL', # header specified, so we don't need this from_NA => 0, to_NA => 'N/A' # interpret zeroes as N/A's and substitute }); =head1 DESCRIPTION Data::FeatureFactory automates evaluation of features of data samples and optionally encodes them as numbers or as binary vectors. =head2 Defining features The features are defined as subroutines in a package inheriting from Data::FeatureFactory. A subroutine is declared to be a feature by being mentioned in the package array C<@features>. Options for the features are also specified in this array. Its minimum structure is as follows: @features = ( { name => "name of feature 1" }, { name => "name of feature 2" }, ... ) The elements of the array must be hashrefs and each of them must have a C field. Other fields can specify options for the features. These are: =over 4 =item type Specifies if the feature is C, C, C or C. Only the first three characters, case insensitive, are considered, so you can as well say C, C, C or C. The default type is categorial. Integer and numeric features will have values forced to numbers. Boolean ones will have values converted to 1/0 depending on Perl's notion of True/False. If you use warnings, you'll get one if your numeric feature returns a non-numeric string. =item values Lists the acceptable values for the feature to return. If a different value is returned by the subroutine, the whole feature vector is discarded. Alternatively, a default value can be specified. Whenever the order of the values matters, it is honored (as in transfer to numeric format). The values can be specified as an arrayref (in which case the order is regarded) or as a hashref, in which case the values are pseudo-randomly ordered, but the loading time is faster and transfer to numeric or binary format is faster as well. If the values are specified as a hashref, then keys of the hash shall contain the values of the feature and values of the hash should be 1's. =item default Specifies a default value to be substituted when the feature returns something not listed in C. =item values_file The values can either be listed directly or in a file. This option specifies its name. This option must not appear in combination with the C option. Each value shall be on one line, with no headers, no intervening whitespace no comments and no empty lines. The file is expected to be encoded in UTF-8 on perls supporting the C<:encoding> discipline for the C function. =item range In case of integer and numeric features, an allowed range can be specified instead of the values. This option cannot appear together with the C or C option. The behavior is the same as with the C option. The interval specified is closed, so returning the limit value is OK. The range shall be specified by two numeric expressions separated by two or more dots with optional surrounding whitespace - for example 2..5 or -0.5 ...... +1.000_005. The stuff around the dots are not checked to be valid numeric expressions. But you should get a warning if you use them when you supply something nonsensical. You can also specify a range for numeric (non-integer) features. The return value will be checked against it but unlike integer features, this will not generate a list of acceptible values. Therefore, range is not enough to specify for a numeric feature if you want to have it converted to binary. (though converting floating-point values to binary vectors seems rather quirky by itself) =item postproc This option defines a subroutine that is to be used as a filter for the feature's return value. It comes in handy when you, for example, have a feature returning UTF-8 encoded text and you need it to appear ASCII-encoded but you need to specify the acceptable values in UTF-8. As this use-case suggests, the postprocessing takes place after the value is checked against the list of acceptable values. The value for this option shall either be a coderef or the name of the preprocessing function. If the function is not available in the current namespace, Data::FeatureFactory will attempt to find it. The postprocessing only takes place when the feature is evaluated normally - that is, when its output is not being transformed to numeric or binary format. =item code Normally, the features are defined as subroutines in the package that inherits from Data::FeatureFactory. However, the definition can also be provided as a coderef in this option or in the C<%features> hash of the package. The priority is: 1) the C option, 2) the C<%features> hash, and 3) the package subroutine. =item format Features can be output in different ways - see below. The format in which the features are evaluated is normally specified for all features in the call to C. You can override it for specific features with this option. You'll mostly use this to prevent the target (to-predict) feature from being numified or binarified: { name => 'target', format => 'normal' }. =item label The value of this field can either be a string or an arrayref specifying a list of labels for the feature. It's usable when you want to evaluate a group of features without having to list them. See the C method for details. =back =head3 Notice Both the feature and the optional postprocessing routine are evaluated in scalar context. When the N/A option is used, then C is treated specially but if N/A is not specified, then it is not. Assume you have a feature with values specified, a default value and the feature returns undef. Then if you use the N/A option, the N/A value is substituted, but if you don't use the N/A option, the default value is substituted instead (or it is left as an empty string if it's a valid value). The postprocessing subroutine, if specified, can be called several times during the construction of the object and within any methods. So it's highly advisable for postprocessing subroutines to have no side-effects. =head2 Creating the features object The C method creates an object that can then be used to evaluate features. Please do *not* override the C method. If you do, then be sure that it calls C properly. This method accepts an optional argument - a hashref with options. Currently, only the 'N/A' option is supported. See below for details. =head2 Getting the list of defined features The C method returns a list of names of all the features defined. =head2 Evaluating features The C method of Data::FeatureFactory takes these arguments: 1) names of the features to evaluate, 2) the format in which they should be output and 3) arguments for the features themselves. The first argument can be an arrayref with the names of the features, or it can be a string. In case it's a string containing lowercase letters, then it's interpreted as the name of the only feature to evaluate. If all the letters in the string are UPPERCASE, then it's interpreted as a whitespace-separated list of labels. Each label can be prefixed by a C<-> sign or a C<+> sign. No prefix is the same as the C<+> prefix. The features evaluated are then those, who have at least one C<+>label but no C<->label. The presence of a negative label overrides the presence of a positive one in case of collisions. There is a special label C, which you should never define for a feature and which will match any feature. It must not be used with the minus sign. Only specifying negative labels implies the C label, so you can write C<-TARGET> to get all but the target features. The features are sorted by how they appear in the @features array - the order of labels has no effect whatsoever and no feature is added twice. C is the same as just C. Since labels can only be specified in upper case for the C method, they are matched case-insensitive. The second argument is C, C or C. C means that the features' return values should be left alone (but postprocessed if such option is set). C and C mean that the features' return values should be converted into numbers or binary vectors, as for support vector machines or neural networks to like them. The return value is the list of what the features returned. In case of binary, there can be a different (typically greater) number of elements in the returned list than there were features to evaluate. During evaluation, the features can access the C<$Data::FeatureFactory::CURRENT_FEATURE> variable, which holds the name of the feature evaluated. =head3 Transfer to numeric / binary form When you have the features output in numeric format, then integer and numeric features are left alone and categorial ones have a natural number (starting with 1) assigned to every distinct value. If you use this feature, it is highly recommended to specify the values for the feature. If you don't then Data::FeatureFactory will attempt to create a mapping from the categories to numbers dynamically as then feature is evaluated. The mapping is being saved to a file whose name is C<.FeatureFactory.I__I> and is located in the directory where Data::FeatureFactory resides if possible, or in your home directory or in /tmp - wherever the script can write. If none works, then you get a fatal error. The mapping is restored and extended upon subsequent runs with the same package and feature name, if read/write permissions don't change. Binary format is such that the return value is converted to a vector of all 0's and one 1. The positions in the vector represent the possible values of the feature and 1 is on the position that the feature actually has in that particular case. The values always need to be specified for this feature to work and it is highly recommended that they be specified with a fixed order (not by a hash), because else the order can change with different versions of perl and when you change the set of values for the feature. And when the order changes, then the meaning of the vectors change. =head2 N/A values You can specify a value to be substituted when a feature returns nothing (an undefined value). This is passed as an argument to the C method. $f = MyFeatures->new({ 'N/A' => '_' }); # MyFeatures inherits from Data::FeatureFactory $v = $f->evaluate('feature1', 'normal', 'unexpected_argument'); If C returns an undefined value, then $v will contain the string '_'. When evaluating in binary format, a vector of the usual length is returned, with all values being the specified N/A. That is, if C has 3 possible values, then @v = $f->evaluate('feature1', 'binary', 'unexpected_argument'); will result in @v being C<('_', '_', '_')>. If C returns undef, that is. N/A values don't get postprocessed in case a postprocessing function is specified. =head2 Conversion between formats Once you evaluate the features on a million observations and save it in a file, you might want to get the values in another format without having to evaluate the features all over again (which can be time consuming). This is where the method C comes in handy. C accepts three arguments: Source filehandle, destination filehandle and a hash with options (some of which aren't actually optional). The options are: =over 4 =item from_format =item to_format C, C or C. =item names The names of the features that are in the source file, in order. This can be anything that the C method accepts: An arrayref with the actual names of the features present, or a label expression (see L). =item header If the names of the features are in the first line of the source file, don't specify the C option but set the C
option to a true value instead. The names of the features in the header shall be separated with the same string that separates the values on the following lines. There can be any number of separators between the feature names and Data::FeatureFactory will treat C exactly the same as C (assuming you use comma as separator). This only applies in the header and has a reason: When the header is in the source file, then it's translated to the output as well. And since in the binary format, the features span usually more than one column, Data::FeatureFactory::translate will put so many separators after each feature name as there are columns to its value. This is so you need not use the module to figure out how many digits each feature has. For example, if you have feature C with three possible values, then its name in the header will be followed by three separators: C. When reading the header, this is discarded because 1) You may want to write the header yourself or use one from a non-binary version and 2) Data::FeatureFactory has all information it needs in the @features array. =item FS The field separator that delimits the values in the files. =item OFS The output field separator: If you want the values separated with a different character / string in the destination file than in the source file, then this is the option to use. =item from_NA =item to_NA The value specified for C is interpreted as denoting the N/A values in the source file. These values will be converted to C in the destination file. If the C object has the N/A option specified, then that value is assumed for any of these two options implicitly. =back Required options are: from_format, to_format, FS and either names or header. Note that when translating a categorial feature without values specified to/from numeric format, then the dynamic mapping of values created by C must get resumed successfully. Otherwise you'll get an error about unexpected value as soon as the translation is attempted. =head3 Translating rows (not files) There's also a lower-level method available: C. Unlike the C method, it doesn't accept filehandles but accepts an arrayref with values and returns an array with the translated values. The arguments are: =over 4 =item names This time a required argument, same as the C option to C. =item values The arrayref of values to convert. =item options Same as with the C method, except the C and C
options are not accepted. =back This method has the slight difference over C (beside only translating one row per call) that if the C option is specified but neither C option to the method nor the C option to the object are specified, then undef's are interpreted as N/A values. =head2 Other low-level methods There are some other subroutines defined in C. One of those that might be of use to you is the C method, which you can give a label expression as an argument and it will give you the list of feature names that this label expression represents. For a description of how labels work, see L. =head1 COPYRIGHT Copyright (c) 2008 Oldrich Kruza. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut