package Chemistry::File::SLN; $VERSION = "0.11"; # $Id: SLN.pm,v 1.4 2005/03/29 16:38:06 itubert Exp $ use 5.006; use strict; use warnings; use base "Chemistry::File"; use Chemistry::Mol; use Chemistry::File::SLN::Parser; use Chemistry::Bond::Find 'assign_bond_orders'; use List::Util qw(sum); =head1 NAME Chemistry::File::SLN - SLN linear notation parser/writer =head1 SYNOPSYS #!/usr/bin/perl use Chemistry::File::SLN; # parse a SLN string for benzene my $s = 'C[1]H:CH:CH:CH:CH:CH@1'; my $mol = Chemistry::Mol->parse($s, format => 'sln'); # print a SLN string print $mol->print(format => 'sln'); # print a unique (canonical) SLN string print $mol->print(format => 'sln', unique => 1); # parse a multiline SLN file my @mols = Chemistry::Mol->read("file.sln", format => 'sln'); # write a multiline SLN file Chemistry::Mol->write("file.sln", mols => [@mols]); =head1 DESCRIPTION This module parses a SLN (Sybyl Line Notation) string. This is a File I/O driver for the PerlMol project. L. It registers the 'sln' format with Chemistry::Mol, and recognizes filenames ending in '.sln'. Optional attributes for atoms, bonds, and molecules are stored as $atom->attr("sln/attr"), $bond->attr("sln/attr"), and $mol->attr("sln/attr"), respectively. Boolean attributes are stored with a value of 'TRUE'. That's the way boolean attributes are recognized when writing, so that they can be written in the shortened form. $sln_attr->{backbone} = 1; # would be ouput as "C[backbone=1]" $sln_attr->{backbone} = 'TRUE'; # would be ouput as "C[backbone]" Also note that attribute names are normalized to lowercase on reading. =head1 OPTIONS The following options are available when reading: =over =item kekulize Assign bond orders for unsatisfied valences or for aromatic bonds. For example, benzene read as C[1]H:CH:CH:CH:CH:CH@1 will be converted internally to something like C[1]H=CHCH=CHCH=CH@1. This is needed if another format or module expects a Kekule representation without an aromatic bond type. =back The following options are available when writing: =over =item mols If this option points to an array of molecules, these molecules will be written, one per line, as in the example in the SYNOPSYS. =item aromatic Detect aromaticity before writing. This will ensure that aromatic bond types are used instead of alternate single and double bonds. =item unique Canonicalize before writing, and produce a unique strucure. NOTE: this option does not guarantee a unique representation for molecules with bracketed attributes. =item name Include the name of the molecule ($mol->name) in the output string. =item coord3d, coords Include the 3D coordinates of every atom in the molecule in the output string. C and C may be used interchangeably. =item attr Output the atom, bond, and molecule attributes found in $mol->attr("sln/attr"), etc. =back =head1 CAVEATS This version does not implement the full SLN specification. It supports simple structures and some attributes, but it does not support any of the following: =over =item Macro atoms =item Pattern matching options =item Markush structures =item 2D Coordinates =back The SLN specification is vague on several points, and I don't have a reference implementation available, so I had to make several arbitrary decisions. Also, this version of this module has not been tested exhaustively, so please report any bugs that you find. If the parser doesn't understand a string, it only says "syntax error", which may not be very helpful. =cut # INITIALIZATION Chemistry::Mol->register_format('sln'); my $Parser = Chemistry::File::SLN::Parser->new; sub name_is { my ($self, $name) = @_; $name =~ /\.sln$/i; } sub file_is { $_[0]->name_is($_[1]); } sub parse_string { my ($self, $string, %opts) = @_; my (@lines) = split /(?:\n|\r\n?)/, $string; my @mols; for my $line (@lines) { my $mol = $self->parse_single_line($line, %opts); return $mol unless wantarray; push @mols, $mol; } @mols; } sub parse_single_line { my ($self, $string, %opts) = @_; my $mol_class = $opts{mol_class} || "Chemistry::Mol"; # call the actual yapp-generated parser my $tree = $Parser->run($string) or return; #use Data::Dumper; print Dumper $tree; my $mol = $mol_class->new; my @nodes = @{$tree->{chain}}; my %closures; my $last_atom; my @stack; while (my $node = shift @nodes) { if ($node eq '(') { push @stack, $last_atom; } elsif ($node eq ')') { $last_atom = pop @stack; } elsif($last_atom) { # bond my $next = shift @nodes; if ($next->{closure}) { my $atom = $closures{$next->{closure}}; $self->compile_bond($mol, $node, $last_atom, $atom); } else { my $atom = $self->compile_atom($mol, $next, \%closures); $self->compile_bond($mol, $node, $last_atom, $atom); $last_atom = $atom; } } else { # first atom $last_atom = $self->compile_atom($mol, $node, \%closures); } } if ($opts{kekulize}) { assign_bond_orders($mol, method => "itub", use_coords => 0, scratch => 0, charges => 0); } my @sln_attr; while (my ($attr, $value) = each %{$tree->{attr}}) { if ($attr eq 'name') { $mol->name($value); } elsif ($attr eq 'type') { $mol->type($value); } elsif ($attr eq 'coord3d') { $self->read_coords($mol, $value); } else { push @sln_attr, $attr, $value; } } $mol->attr("sln/attr", {@sln_attr}) if @sln_attr; $mol; } sub compile_atom { my ($self, $mol, $node, $closures) = @_; my $atom = $mol->new_atom( symbol => $node->{symbol}, hydrogens => $node->{hcount}, formal_charge => $node->{attr}{charge}, ); $atom->attr("sln/attr", $node->{attr}); delete $node->{attr}{charge}; $closures->{$node->{id}} = $atom if $node->{id}; $atom; } my %TYPE_TO_ORDER = ( '-' => 1, '=' => 2, '#' => 3, ':' => 1, '.' => 0, ); sub compile_bond { my ($self, $mol, $node, $atom1, $atom2) = @_; my $order = $TYPE_TO_ORDER{$node->{type}}; if ($order) { my $bond = $mol->new_bond( type => $node->{type}, atoms=>[$atom1, $atom2], order => $order, ); $bond->attr("sln/attr", $node->{attr}); if ($node->{type} eq ':') { $_->aromatic(1) for ($atom1, $atom2, $bond); } } } sub read_coords { my ($self, $mol, $coords_str) = @_; $coords_str =~ s/[()]//g; my (@coords) = split /,/, $coords_str; my $fh = $mol->formula_hash; my $n = sum(values %$fh); my $sprout = (@coords == 3*$n); for my $atom ($mol->atoms) { $atom->coords(splice @coords, 0, 3); if ($sprout) { for (1 .. $atom->implicit_hydrogens) { my $H = $mol->new_atom(symbol => 'H', coords => [splice @coords, 0, 3]); $mol->new_bond(atoms => [$atom, $H]); } $atom->implicit_hydrogens(0); } } } ########### WRITER ################# sub write_string { my ($self, $mol_ref, %opts) = @_; my $eol; my @mols; if ($opts{mols}) { @mols = @{$opts{mols}}; $eol = "\n"; } else { @mols = $mol_ref; $eol = ""; } my $sln; for my $mol (@mols) { $sln .= $self->write_mol($mol, %opts) . $eol; } $sln; } sub write_mol { my ($self, $mol, %opts) = @_; my $oldmol = $mol; $mol = $mol->clone; my $sln = ''; my @id_log; if ($mol->atoms) { my @atoms = $self->clean_mol($mol, %opts); my $visited = {}; my @s; for my $atom (@atoms) { next if $visited->{$atom}; my $ring_atoms = {}; # first pass to find and number the ring bonds $self->find_ring_bonds($mol, \%opts, $atom, undef, {}, $ring_atoms); # second pass to actually generate the sln string push @s, $self->branch($mol, \%opts, $atom, undef, $visited, $ring_atoms, \@id_log); } $sln .= join '.', @s; } $sln .= $self->format_ctab_attr($mol, \%opts, $oldmol, \@id_log); } sub clean_mol { my ($self, $mol, %opts) = @_; $self->collapse_hydrogens($mol); my @atoms = $mol->atoms; if ($opts{unique}) { unless ($atoms[0]->attr("canon/class")) { require Chemistry::Canonicalize; Chemistry::Canonicalize::canonicalize($mol); } #$opts{aromatic} = 1; # all unique sln have to be aromatic @atoms = sort { $a->attr("canon/class") <=> $b->attr("canon/class") } @atoms; } if ($opts{aromatic}) { require Chemistry::Ring; Chemistry::Ring::aromatize_mol($mol); } @atoms; } sub format_ctab_attr { my ($self, $mol, $opts, $oldmol, $id_log) = @_; my $sln = ''; if ($opts->{name} or $opts->{attr} or $opts->{coords} or $opts->{coord3d}) { no warnings 'uninitialized'; my @attr; my $name = $mol->name; $name =~ s/[\r\n]//g; push @attr, 'name="' . $mol->name . '"' if $opts->{name} and length $mol->name; my @coords; if ($opts->{coord3d} or $opts->{coords}) { my @all_atoms = map { ( $oldmol->by_id($_), grep {$_->symbol eq 'H'} $oldmol->by_id($_)->neighbors ) } @$id_log; push @coords, sprintf("(%.3f,%.3f,%.3f)",$_->coords->array) for @all_atoms; push @attr, 'coord3d=' . join(',',@coords); } if ($opts->{attr}) { push @attr, $self->format_sln_attr($mol); } $sln .= '<' . join(';', @attr) . '>' if @attr; } $sln; } sub find_ring_bonds { my ($self, $mol, $opts, $atom, $from_bond, $visited, $ring_atoms) = @_; $visited->{$atom} = 1; for my $bn (sorted_bonds_neighbors($atom, $opts)) { my $nei = $bn->{to}; my $bond = $bn->{bond}; next if $visited->{$bond}; $visited->{$bond} = 1; if ($visited->{$nei}) { # closed ring #print "closing ring\n"; $ring_atoms->{$nei}++; } else { $self->find_ring_bonds($mol, $opts, $nei, $bond, $visited, $ring_atoms); } } } sub branch { my ($self, $mol, $opts, $atom, $from_bond, $visited, $digits, $id_log) = @_; my $prev_branch = ""; my $sln; $sln .= $self->format_bond($from_bond, $opts); my $digit; if ($digits->{$atom}) { # opening a ring $digit = $self->next_digit($digits); $digits->{$atom} = $digit; } $sln .= $self->format_atom($atom, $opts, $digit); push @$id_log, $atom->id; $visited->{$atom} = 1; my @bns = sorted_bonds_neighbors($atom, $opts); for my $bn (@bns) { my $nei = $bn->{to}; my $bond = $bn->{bond}; next if $visited->{$bond}; $visited->{$bond} = 1; if ($visited->{$nei}) { # closed a ring if ($prev_branch) { $sln .= "($prev_branch)"; } $prev_branch = $self->format_bond($bond, $opts) . '@' . $digits->{$nei}; $visited->{$bond} = 1; } else { my $branch = $self->branch($mol, $opts, $nei, $bond, $visited, $digits, $id_log); if ($prev_branch) { $sln .= "($prev_branch)"; } $prev_branch = $branch; } } $sln .= "$prev_branch"; $sln; } sub next_digit { my ($self, $digits) = @_; ++$digits->{used_digits}; } sub collapse_hydrogens { my ($self, $mol) = @_; for my $atom (grep {$_->symbol eq 'H'} $mol->atoms) { my ($neighbor) = $atom->neighbors or next; $atom->delete; my $h_count = $neighbor->hydrogens; $h_count++; $neighbor->hydrogens($h_count); } } sub sorted_bonds_neighbors { my ($atom, $opts) = @_; my @bn = $atom->bonds_neighbors; if ($opts->{unique}) { @bn = sort { $a->{to}->attr("canon/class") <=> $b->{to}->attr("canon/class") } @bn; } @bn; } my %ORDER_TO_TYPE = ( 1 => '', 2 => '=', 3 => '#', 4 => '', 0 => '.', ); sub format_bond { my ($self, $bond, $opts) = @_; return '' unless $bond; my $s = $bond->aromatic ? ':' : $ORDER_TO_TYPE{$bond->order}; my @attr; @attr = $self->format_sln_attr($bond) if $opts->{attr}; if (@attr) { $s .= '[' . join(";", @attr) . ']'; } $s; } sub format_atom { my ($self, $atom, $opts, $digit) = @_; my $s; no warnings 'uninitialized'; my $h_count = $atom->hydrogens; my $charge = $atom->formal_charge; my $symbol = $atom->symbol; $charge = $charge ? sprintf("%+d", $charge): ''; $h_count = $h_count ? ($h_count > 1 ? "H$h_count" : 'H') : ''; $s = $symbol; my @attr; @attr = $self->format_sln_attr($atom) if $opts->{attr}; if ($charge or $digit or @attr) { $s .= '['; $s .= $digit; unshift @attr, $charge if $charge; if (@attr) { $s .= ':' if $digit; $s .= join ';', @attr; } $s .= ']'; } $s .= $h_count; $s; } sub format_sln_attr { my ($self, $obj) = @_; my $sln_attr = $obj->attr("sln/attr") || {}; my @attr; for my $key (sort keys %$sln_attr) { my $val = $sln_attr->{$key}; push @attr, "$key" . ($val eq 'TRUE' ? "" : "=$val"); } @attr; } 1; =head1 VERSION 0.11 =head1 SEE ALSO L, L, L The PerlMol website L Ash, S.; Cline, M. A.; Homer, R. W.; Hurst, T.; Smith, G. B., SYBYL Line Notation (SLN): A Versatile Language for Chemical Structure Representation. J. Chem. Inf. Comput. Sci; 1997; 37(1); 71-79. DOI: 10.1021/ci960109j (L) =head1 AUTHOR Ivan Tubert-Brohman Eitub@cpan.orgE =head1 COPYRIGHT Copyright (c) 2004 Ivan Tubert-Brohman. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =cut