package IntervalTree; use 5.006; use POSIX qw(ceil); use List::Util qw(max min); use strict; use warnings; no warnings 'once'; our $VERSION = '0.03'; =head1 NAME IntervalTree.pm =head1 VERSION Version 0.01 =head1 DESCRIPTION Data structure for performing intersect queries on a set of intervals which preserves all information about the intervals (unlike bitset projection methods). =cut # Historical note: # This module original contained an implementation based on sorted endpoints # and a binary search, using an idea from Scott Schwartz and Piotr Berman. # Later an interval tree implementation was implemented by Ian for Galaxy's # join tool (see `bx.intervals.operations.quicksect.py`). This was then # converted to Cython by Brent, who also added support for # upstream/downstream/neighbor queries. This was modified by James to # handle half-open intervals strictly, to maintain sort order, and to # implement the same interface as the original Intersecter. =head1 SYNOPSIS Data structure for performing window intersect queries on a set of of possibly overlapping 1d intervals. Usage ===== Create an empty IntervalTree >>> use IntervalTree; >>> my $intersecter = IntervalTree->new(); An interval is a start and end position and a value (possibly None). You can add any object as an interval: >>> $intersecter->insert( 0, 10, "food" ); >>> $intersecter->insert( 3, 7, {foo=>'bar'} ); >>> $intersecter->find( 2, 5 ); ['food', {'foo'=>'bar'}] If the object has start and end attributes (like the IntervalTree::Interval class) there is are some shortcuts: >>> my $intersecter = IntervalTree->new(); >>> $intersecter->insert_interval( IntervalTree::Interval->new( 0, 10 ) ); >>> $intersecter->insert_interval( IntervalTree::Interval->new( 3, 7 ) ); >>> $intersecter->insert_interval( IntervalTree::Interval->new( 3, 40 ) ); >>> $intersecter->insert_interval( IntervalTree::Interval->new( 13, 50 ) ); >>> $intersecter->find( 30, 50 ); [IntervalTree::Interval(3, 40), IntervalTree::Interval(13, 50)] >>> $intersecter->find( 100, 200 ); [] Before/after for intervals >>> $intersecter->before_interval( IntervalTree::Interval->new( 10, 20 ) ); [IntervalTree::Interval(3, 7)] >>> $intersecter->before_interval( IntervalTree::Interval->new( 5, 20 ) ); [] Upstream/downstream >>> $intersecter->upstream_of_interval(IntervalTree::Interval->new(11, 12)); [IntervalTree::Interval(0, 10)] >>> $intersecter->upstream_of_interval(IntervalTree::Interval->new(11, 12, undef, undef, "-")); [IntervalTree::Interval(13, 50)] >>> intersecter.upstream_of_interval(IntervalTree::Interval(1, 2, undef, undef, "-"), 3); [IntervalTree::Interval(3, 7), IntervalTree::Interval(3, 40), IntervalTree::Interval(13, 50)] =cut sub new { my ( $class ) = @_; my $self = {}; $self->{root} = undef; return bless $self, $class; } # ---- Position based interfaces ----------------------------------------- =head2 insert Insert the interval [start,end) associated with value `value`. =cut sub insert { my ( $self, $start, $end, $value) = @_; if (!defined $self->{root}) { $self->{root} = IntervalTree::Node->new( $start, $end, $value ); } else { $self->{root} = $self->{root}->insert( $start, $end, $value ); } } *add = \&insert; =head2 find Return a sorted list of all intervals overlapping [start,end). =cut sub find { my ( $self, $start, $end ) = @_; if (!defined $self->{root}) { return []; } return $self->{root}->find( $start, $end ); } =head2 before Find `num_intervals` intervals that lie before `position` and are no further than `max_dist` positions away =cut sub before { my ( $self, $position, $num_intervals, $max_dist ) = @_; $num_intervals = 1 if !defined $num_intervals; $max_dist = 2500 if !defined $max_dist; if (!defined $self->{root}) { return []; } return $self->{root}->left( $position, $num_intervals, $max_dist ); } =head2 after Find `num_intervals` intervals that lie after `position` and are no further than `max_dist` positions away =cut sub after { my ( $self, $position, $num_intervals, $max_dist) = @_; $num_intervals = 1 if !defined $num_intervals; $max_dist = 2500 if !defined $max_dist; if (!defined $self->{root}) { return []; } return $self->{root}->right( $position, $num_intervals, $max_dist ); } # ---- Interval-like object based interfaces ----------------------------- =head2 insert_interval Insert an "interval" like object (one with at least start and end attributes) =cut sub insert_interval { my ( $self, $interval ) = @_; $self->insert( $interval->{start}, $interval->{end}, $interval ); } *add_interval = \&insert_interval; =head2 before_interval Find `num_intervals` intervals that lie completely before `interval` and are no further than `max_dist` positions away =cut sub before_interval { my ( $self, $interval, $num_intervals, $max_dist ) = @_; $num_intervals = 1 if !defined $num_intervals; $max_dist = 2500 if !defined $max_dist; if (!defined $self->{root}) { return []; } return $self->{root}->left( $interval->{start}, $num_intervals, $max_dist ); } =head2 after_interval Find `num_intervals` intervals that lie completely after `interval` and are no further than `max_dist` positions away =cut sub after_interval { my ( $self, $interval, $num_intervals, $max_dist ) = @_; $num_intervals = 1 if !defined $num_intervals; $max_dist = 2500 if !defined $max_dist; if (!defined $self->{root}) { return []; } return $self->{root}->right( $interval->{end}, $num_intervals, $max_dist ); } =head2 upstream_of_interval Find `num_intervals` intervals that lie completely upstream of `interval` and are no further than `max_dist` positions away =cut sub upstream_of_interval { my ( $self, $interval, $num_intervals, $max_dist ) = @_; $num_intervals = 1 if !defined $num_intervals; $max_dist = 2500 if !defined $max_dist; if (!defined $self->{root}) { return []; } if ($interval->{strand} == -1 || $interval->{strand} eq "-") { return $self->{root}->right( $interval->{end}, $num_intervals, $max_dist ); } else { return $self->{root}->left( $interval->{start}, $num_intervals, $max_dist ); } } =head2 downstream_of_interval Find `num_intervals` intervals that lie completely downstream of `interval` and are no further than `max_dist` positions away =cut sub downstream_of_interval { my ( $self, $interval, $num_intervals, $max_dist ) = @_; $num_intervals = 1 if !defined $num_intervals; $max_dist = 2500 if !defined $max_dist; if (!defined $self->{root}) { return []; } if ($interval->{strand} == -1 || $interval->{strand} eq "-") { return $self->{root}->left( $interval->{start}, $num_intervals, $max_dist ); } else { return $self->{root}->right( $interval->{end}, $num_intervals, $max_dist ); } } =head2 traverse call fn for each element in the tree =cut sub traverse { my ($self, $fn) = @_; if (!defined $self->{root}) { return undef; } return $self->{root}->traverse($fn); } =head2 IntervalTree::Node A single node of an `IntervalTree`. NOTE: Unless you really know what you are doing, you probably should us `IntervalTree` rather than using this directly. =cut package IntervalTree::Node; use List::Util qw(min max); our $EmptyNode = IntervalTree::Node->new( 0, 0, IntervalTree::Interval->new(0, 0)); sub nlog { return -1.0 / log(0.5); } sub left_node { my ($self) = @_; return $self->{cleft} != $EmptyNode ? $self->{cleft} : undef; } sub right_node { my ($self) = @_; return $self->{cright} != $EmptyNode ? $self->{cright} : undef; } sub root_node { my ($self) = @_; return $self->{croot} != $EmptyNode ? $self->{croot} : undef; } sub str { my ($self) = @_; return "Node($self->{start}, $self->{end})"; } sub new { my ($class, $start, $end, $interval) = @_; # Perl lacks the binomial distribution, so we convert a # uniform into a binomial because it naturally scales with # tree size. Also, perl's uniform is perfect since the # upper limit is not inclusive, which gives us undefined here. my $self = {}; $self->{priority} = POSIX::ceil(nlog() * log(-1.0/(1.0 * rand() - 1))); $self->{start} = $start; $self->{end} = $end; $self->{interval} = $interval; $self->{maxend} = $end; $self->{minstart} = $start; $self->{minend} = $end; $self->{cleft} = $EmptyNode; $self->{cright} = $EmptyNode; $self->{croot} = $EmptyNode; return bless $self, $class; } =head2 insert Insert a new IntervalTree::Node into the tree of which this node is currently the root. The return value is the new root of the tree (which may or may not be this node!) =cut sub insert { my ($self, $start, $end, $interval) = @_; my $croot = $self; # If starts are the same, decide which to add interval to based on # end, thus maintaining sortedness relative to start/end my $decision_endpoint = $start; if ($start == $self->{start}) { $decision_endpoint = $end; } if ($decision_endpoint > $self->{start}) { # insert to cright tree if ($self->{cright} != $EmptyNode) { $self->{cright} = $self->{cright}->insert( $start, $end, $interval ); } else { $self->{cright} = IntervalTree::Node->new( $start, $end, $interval ); } # rebalance tree if ($self->{priority} < $self->{cright}{priority}) { $croot = $self->rotate_left(); } } else { # insert to cleft tree if ($self->{cleft} != $EmptyNode) { $self->{cleft} = $self->{cleft}->insert( $start, $end, $interval); } else { $self->{cleft} = IntervalTree::Node->new( $start, $end, $interval); } # rebalance tree if ($self->{priority} < $self->{cleft}{priority}) { $croot = $self->rotate_right(); } } $croot->set_ends(); $self->{cleft}{croot} = $croot; $self->{cright}{croot} = $croot; return $croot; } sub rotate_right { my ($self) = @_; my $croot = $self->{cleft}; $self->{cleft} = $self->{cleft}{cright}; $croot->{cright} = $self; $self->set_ends(); return $croot; } sub rotate_left { my ($self) = @_; my $croot = $self->{cright}; $self->{cright} = $self->{cright}{cleft}; $croot->{cleft} = $self; $self->set_ends(); return $croot; } sub set_ends { my ($self) = @_; if ($self->{cright} != $EmptyNode && $self->{cleft} != $EmptyNode) { $self->{maxend} = max($self->{end}, $self->{cright}{maxend}, $self->{cleft}{maxend}); $self->{minend} = min($self->{end}, $self->{cright}{minend}, $self->{cleft}{minend}); $self->{minstart} = min($self->{start}, $self->{cright}{minstartz}, $self->{cleft}{minstart}); } elsif ( $self->{cright} != $EmptyNode) { $self->{maxend} = max($self->{end}, $self->{cright}{maxend}); $self->{minend} = min($self->{end}, $self->{cright}{minend}); $self->{minstart} = min($self->{start}, $self->{cright}{minstart}); } elsif ( $self->{cleft} != $EmptyNode) { $self->{maxend} = max($self->{end}, $self->{cleft}{maxend}); $self->{minend} = min($self->{end}, $self->{cleft}{minend}); $self->{minstart} = min($self->{start}, $self->{cleft}{minstart}); } } =head2 intersect given a start and a end, return a list of features falling within that range =cut sub intersect { my ( $self, $start, $end, $sort ) = @_; $sort = 1 if !defined $sort; my $results = []; $self->_intersect( $start, $end, $results ); return $results; } *find = \&intersect; sub _intersect { my ( $self, $start, $end, $results) = @_; # Left subtree if ($self->{cleft} != $EmptyNode && $self->{cleft}{maxend} > $start) { $self->{cleft}->_intersect( $start, $end, $results ); } # This interval if (( $self->{end} > $start ) && ( $self->{start} < $end )) { push @$results, $self->{interval}; } # Right subtree if ($self->{cright} != $EmptyNode && $self->{start} < $end) { $self->{cright}->_intersect( $start, $end, $results ); } } sub _seek_left { my ($self, $position, $results, $n, $max_dist) = @_; # we know we can bail in these 2 cases. if ($self->{maxend} + $max_dist < $position) { return; } if ($self->{minstart} > $position) { return; } # the ordering of these 3 blocks makes it so the results are # ordered nearest to farest from the query position if ($self->{cright} != $EmptyNode) { $self->{cright}->_seek_left($position, $results, $n, $max_dist); } if (-1 < $position - $self->{end} && $position - $self->{end} < $max_dist) { push @$results, $self->{interval}; } # TODO: can these conditionals be more stringent? if ($self->{cleft} != $EmptyNode) { $self->{cleft}->_seek_left($position, $results, $n, $max_dist); } } sub _seek_right { my ($self, $position, $results, $n, $max_dist) = @_; # we know we can bail in these 2 cases. return if $self->{maxend} < $position; return if $self->{minstart} - $max_dist > $position; #print "SEEK_RIGHT:",self, self.cleft, self.maxend, self.minstart, position # the ordering of these 3 blocks makes it so the results are # ordered nearest to farest from the query position if ($self->{cleft} != $EmptyNode) { $self->{cleft}->_seek_right($position, $results, $n, $max_dist); } if (-1 < $self->{start} - $position && $self->{start} - $position < $max_dist) { push @$results, $self->{interval}; } if ($self->{cright} != $EmptyNode) { $self->{cright}->_seek_right($position, $results, $n, $max_dist); } } =head2 left find n features with a start > than `position` f: a IntervalTree::Interval object (or anything with an `end` attribute) n: the number of features to return max_dist: the maximum distance to look before giving up. =cut sub left { my ($self, $position, $n, $max_dist) = @_; $n = 1 if !defined $n; $max_dist = 2500 if !defined $max_dist; my $results = []; # use start - 1 becuase .left() assumes strictly left-of $self->_seek_left( $position - 1, $results, $n, $max_dist ); return $results if length(@$results) == $n; my $r = $results; @$r = sort {$b->{end} <=> $a->{end}} @$r; return @$r[0..$n-1]; } =head2 right find n features with a end < than position f: a IntervalTree::Interval object (or anything with a `start` attribute) n: the number of features to return max_dist: the maximum distance to look before giving up. =cut sub right { my ($self, $position, $n, $max_dist) = @_; $n = 1 if !defined $n; $max_dist = 2500 if !defined $max_dist; my $results = []; # use end + 1 because .right() assumes strictly right-of $self->_seek_right($position + 1, $results, $n, $max_dist); return $results if length(@$results) == $n; my $r = $results; @$r = sort {$a->{start} <=> $b->{start}} @$r; return @$r[0..$n-1]; } sub traverse { my ($self, $func) = @_; $self->_traverse($func); } sub _traverse { my ($self, $func) = @_; $self->{cleft}->_traverse($func) if $self->{cleft} != $EmptyNode; $func->($self); $self->{cright}->_traverse($func) if $self->{cright} != $EmptyNode; } ## ---- Wrappers that retain the old interface ------------------------------- =head2 IntervalTree::Interval Basic feature, with required integer start and end properties. Also accepts optional strand as +1 or -1 (used for up/downstream queries), a name, and any arbitrary data is sent in on the info keyword argument >>> from bx.intervals.intersection import IntervalTree::Interval >>> f1 = IntervalTree::Interval->new(23, 36); >>> f2 = IntervalTree::Interval->new(34, 48, {'chr':12, 'anno':'transposon'}); >>> f2 Interval(34, 48, {'anno': 'transposon', 'chr': 12}) =cut package IntervalTree::Interval; sub new { my ($class, $start, $end, $value, $chrom, $strand) = @_; die "start must be less than end" unless $start <= $end; my $self = {}; $self->{start} = $start; $self->{end} = $end; $self->{value} = $value; $self->{chrom} = $chrom; $self->{strand} = $strand; return bless $self, $class; } sub str { my ($self) = @_; my $fstr = "Interval($self->{start}, $self->{end}"; if (defined($self->{value})) { $fstr .= ", value=$self->{value}"; } $fstr .= ")"; return $fstr; } =head2 AUTHOR Ben Booth, C<< >> Original Authors: James Taylor (james@jamestaylor.org), Ian Schenk (ian.schenck@gmail.com), Brent Pedersen (bpederse@gmail.com) =head2 BUGS Please report any bugs or feature requests to C, or through the web interface at L. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes. =head2 SUPPORT You can find documentation for this module with the perldoc command. perldoc IntervalTree You can also look for information at: =over 4 =item * RT: CPAN's request tracker (report bugs here) L =item * AnnoCPAN: Annotated CPAN documentation L =item * CPAN Ratings L =item * Search CPAN L =back =head2 ACKNOWLEDGEMENTS This code was directly ported from the bx-python project: https://bitbucket.org/james_taylor/bx-python/src/tip/lib/bx/intervals/intersection.pyx Original Authors: James Taylor (james@jamestaylor.org), Ian Schenk (ian.schenck@gmail.com), Brent Pedersen (bpederse@gmail.com) =head2 LICENSE AND COPYRIGHT Copyright 2012 Ben Booth. This program is free software; you can redistribute it and/or modify it under the terms of either: the GNU General Public License as published by the Free Software Foundation; or the Artistic License. See http://dev.perl.org/licenses/ for more information. =cut 1; # End of IntervalTree