Source code for ete4.phylo.phylotree

"""
This module defines the PhyloTree class to manage phylogenetic trees.
It inherits from Tree and adds some special features to the the node
instances.
"""

import sys
import re
import warnings
import itertools
from collections import defaultdict
from ete4 import Tree, SeqGroup, NCBITaxa, GTDBTaxa
from .reconciliation import get_reconciled_tree
from . import spoverlap

__all__ = ["PhyloTree"]


def is_dup(n):
    return n.props.get("evoltype") == "D"

def get_subtrees(tree, full_copy=False, properties=None, newick_only=False):
    """Calculate all possible species trees within a gene tree. I
    tested several recursive and iterative approaches to do it and
    this is the most efficient way I found. The method is now fast and
    light enough to deal with very large gene trees, and it scales
    linearly instead of exponentially. For instance, a tree with ~8000
    nodes, ~100 species and ~400 duplications returns ~10,000 sptrees
    that could be loaded in few minutes.

    To avoid memory overloads, this function returns a tuple containing the
    total number of trees, number of duplication events, and an iterator for the
    species trees. Real trees are not actually computed until the iterator is
    first accessed. This allows to filter out cases producing astronomic numbers
    of sptrees.

    """
    ntrees, ndups = calc_subtrees(tree)
    return ntrees, ndups, _get_subtrees(tree, full_copy, properties, newick_only)

def _get_subtrees(tree, full_copy=False, properties=None, newick_only=False):
    # First I need to precalculate all the species trees in tuple (newick) format
    nid = 0
    n2nid = {}
    nid2node = {}
    n2subtrees = defaultdict(list)
    for n in tree.traverse("postorder"):
        n2nid[n] = nid
        nid2node[nid] = n
        nid += 1
        if n.children:
            if is_dup(n):
                subtrees = []
                for ch in n.children:
                    subtrees.extend(n2subtrees[n2nid[ch]])
            else:
                subtrees = tuple([val for val in
                                  itertools.product(n2subtrees[n2nid[n.children[0]]],
                                                    n2subtrees[n2nid[n.children[1]]])])
        else:
            subtrees = tuple([n2nid[n]])

        n2subtrees[n2nid[n]] = subtrees
        for ch in n.children:
            del n2subtrees[n2nid[ch]]

    sp_trees = n2subtrees[n2nid[tree]]

    # Second, I yield a tree per iteration in newick or ETE format
    properties = set(properties) if properties else set()
    properties.update(["name"])

    def _nodereplacer(match):
        pre, b, post =  match.groups()
        pre = '' if not pre else pre
        post = '' if not post else post
        node = nid2node[int(b)]
        fstring = ""
        if properties:
            fstring = "".join(["[&&NHX:",
                               ':'.join(["%s=%s" %(p, node.props.get(p))
                                         for p in properties if node.props.get(p)])
                               , "]"])

        return ''.join([pre, node.name, fstring, post])

    if newick_only:
        id_match = re.compile(r"([^0-9])?(\d+)([^0-9])?")
        for nw in sp_trees:
            yield re.sub(id_match, _nodereplacer, str(nw)+";")
    else:
        for nw in sp_trees:
            # I take advantage from the fact that I generated the subtrees
            # using tuples, so str representation is actually a newick :)
            t = PhyloTree(str(nw)+";")
            # Map properties from original tree
            for leaf in t.leaves():
                _nid = int(leaf.name)
                for p in properties:
                    leaf.add_prop(p, getattr(nid2node[_nid], p))
            yield t

def calc_subtrees(tree):
    """Return the number of species and duplications for the given tree.

    The ones that the TreeKO algorithm would produce.
    """
    n2subtrees = {}
    dups = 0
    for n in tree.traverse("postorder"):
        if n.children:
            if is_dup(n):
                dups += 1
                subtrees = 0
                for ch in n.children:
                    subtrees += n2subtrees[ch]
            else:
                subtrees = n2subtrees[n.children[0]] * n2subtrees[n.children[1]]
        else:
            subtrees = 1
        n2subtrees[n] = subtrees
    return n2subtrees[tree], dups

def iter_sptrees(sptrees, nid2node, properties=None, newick_only=False):
    """ Loads and map the species trees returned by get_subtrees"""

    properties = set(properties) if properties else set()
    properties.update(["name"])

    def _nodereplacer(match):
        pre, b, post =  match.groups()
        node = nid2node[int(b)]
        fstring = ""
        if properties:
            fstring = "".join(["[&&NHX:",
                               ','.join(["%s=%s" %(p, node.props.get(p))
                                         for p in properties if node.props.get(p)])
                               , "]"])

        return ''.join([pre, node.name, fstring, post])

    if newick_only:
        id_match = re.compile(r"([^0-9])(\d+)([^0-9])")
        for nw in sptrees:
            yield re.sub(id_match, _nodereplacer, str(nw)+";")
    else:
        for nw in sptrees:
            # I take advantage from the fact that I generated the subtrees
            # using tuples, so str representation is actually a newick :)
            t = PhyloTree(str(nw)+";")
            # Map properties from original tree
            for leaf in t.leaves():
                _nid = int(leaf.name)
                for p in properties:
                    leaf.add_prop(p, getattr(nid2node[_nid], p))
            yield t

def _get_subtrees_recursive(node, full_copy=True):
    if is_dup(node):
        sp_trees = []
        for ch in node.children:
            sp_trees.extend(_get_subtrees_recursive(ch, full_copy=full_copy))
        return sp_trees

    # saves a list of duplication nodes under current node
    dups = []
    for _n in node.leaves(is_leaf_fn=is_dup):
        if is_dup(_n):
            dups.append(_n)

    if dups:
        # detach inner duplication nodes and stores their anchor point
        subtrees = []
        for dp in dups:
            # The real node to attach sibling subtress
            anchor = dp.up
            dp.detach()

            duptrees = []
            #get all sibling sptrees in each side of the
            #duplication. Each subtree is pointed to its anchor
            for ch in dp.children:
                for subt in _get_subtrees_recursive(ch, full_copy=full_copy):
                    if not full_copy:
                        subt = node.__class__(subt)
                    subt.up = anchor
                    duptrees.append(subt)

            #all posible sptrees under this duplication are stored
            subtrees.append(duptrees)

        # Generates all combinations of subtrees in sibling duplications
        sp_trees = []
        for comb in itertools.product(*subtrees):
            #each subtree is attached to its anchor point and make a copy
            #of the final sp tree
            for subt in comb:
                #anchor = subt2anchor[subt]
                if subt.up:
                    subt.up.children.append(subt)
                    #print subt.up
                else:
                    sp_trees.append(subt)
            if full_copy:
                 back_up = node.up
                 node.up = None
                 _node = node.copy()
                 node.up = back_up
            else:
                _node = node.write(format=9, properties=["name", "evoltype"])
            sp_trees.append(_node)
            # Clear current node
            for subt in comb:
                subt.up.children.pop(-1)
    else:
        if full_copy:
            back_up = node.up
            node.up = None
            _node = node.copy()
            node.up = back_up
        else:
            _node = node.write(format=9, properties=["name", "evoltype"])
        #node.detach()
        sp_trees = [_node]

    return sp_trees

def get_subparts(n):
    subtrees = []
    if is_dup(n):
        for ch in n.get_children():
            ch.detach()
            subtrees.extend(get_subparts(ch))
    else:
        to_visit = []
        for _n in n.leaves(is_leaf_fn=is_dup):
            if is_dup(_n):
                to_visit.append(_n)

        for _n in to_visit:
            _n.detach()

        freaks = [_n for _n in n.descendants() if
                  len(_n.children)==1 or (not hasattr(_n, "_leaf") and not _n.children)]
        for s in freaks:
            s.delete(prevent_nondicotomic=True)

        # Clean node structure to prevent nodes with only one child
        while len(n.children) == 1:
            n = n.children[0]
            n.detach()

        if not n.children and not hasattr(n, "_leaf"):
            pass
        else:
            subtrees.append(n)

        for _n in to_visit:
            subtrees.extend(get_subparts(_n))

    return subtrees



[docs]
class PhyloTree(Tree):
    """
    Class to store a phylogenetic tree.

    Extends the standard :class:`Tree` instance by adding
    specific properties and methods to work with phylogentic trees.
    """


[docs]
    def __init__(self, newick=None, children=None, alignment=None,
                 alg_format="fasta", sp_naming_function=None,
                 parser=None):
        """
        :param newick: If not None, initializes the tree from a newick,
            which can be a string or file object containing it.
        :param children: If not None, the children to add to this node.
        :param alignment: File containing a multiple sequence alignment.
        :param alg_format: "fasta", "phylip" or "iphylip" (interleaved).
        :param parser: Parser to read the newick.
        :param sp_naming_function: Function that gets a node name and
            returns the species name (see
            :func:`PhyloTree.set_species_naming_function`). By default,
            the 3 first letters of node names will be used as species
            identifier.
        """
        super().__init__(data=newick, children=children, parser=parser)

        # This will be only executed after reading the whole tree,
        # because the argument 'alignment' is not passed to the
        # PhyloTree constructor during parsing.
        if alignment:
            self.link_to_alignment(alignment, alg_format)

        if newick:
            self.set_species_naming_function(sp_naming_function)


    @property
    def species(self):
        if self.props.get('_speciesFunction'):
            if 'species' in self.props:
                warnings.warn('Ambiguous species: both species and _speciesFunction'
                             'defined. You can remove "species" from this node.')
            try:
                return self.props.get('_speciesFunction')(self.name)
            except:
                return self.props.get('_speciesFunction')(self)
        else:
            return self.props.get('species')

    @species.setter
    def species(self, value):
        assert self.props.get('_speciesFunction') is None, \
            ('Species naming function present, cannot set species manually. '
             'Maybe call set_species_naming_function() first?')
        self.props['species'] = value

    def __repr__(self):
        return "PhyloTree '%s' (%s)" % (self.name, hex(self.__hash__()))


[docs]
    def write(self, outfile=None, props=(), parser=None,
              format_root_node=False, is_leaf_fn=None):
        if props is None:
            props = sorted(set(p for node in self.traverse()
                               for p in node.props if not p.startswith('_')))
        return super().write(outfile, props, parser, format_root_node, is_leaf_fn)



[docs]
    def set_species_naming_function(self, fn):
        """Set the function used to get the species from the node's name.

        :param fn: Function that takes a nodename and returns the species name.

        Example of a parsing function::

          def parse_sp_name(node_name):
              return node_name.split("_")[1]
          tree.set_species_naming_function(parse_sp_name)
        """
        for n in self.traverse():
            if fn is not None:
                n.props['_speciesFunction'] = fn
            else:
                n.props.pop('_speciesFunction', None)



[docs]
    def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):
        missing_leaves = []
        missing_internal = []
        if type(alignment) == SeqGroup:
            alg = alignment
        else:
            alg = SeqGroup(alignment, format=alg_format, **kwargs)
        # sets the seq of
        for n in self.traverse():
            try:
                n.add_prop("sequence",alg.get_seq(n.name))
            except KeyError:
                if n.is_leaf:
                    missing_leaves.append(n.name)
                else:
                    missing_internal.append(n.name)
        if len(missing_leaves)>0:
            print("Warnning: [%d] terminal nodes could not be found in the alignment." %\
                len(missing_leaves), file=sys.stderr)

        # Show warning of not associated internal nodes.
        # if len(missing_internal)>0:
        #     print >>sys.stderr, \
        #       "Warnning: [%d] internal nodes could not be found in the alignment." %\
        #       len(missing_leaves)


[docs]
    def get_species(self):
        """ Returns the set of species covered by its partition. """
        return set([l.species for l in self.leaves()])



[docs]
    def iter_species(self):
        """ Returns an iterator over the species grouped by this node. """
        spcs = set([])
        for l in self.leaves():
            if l.species not in spcs:
                spcs.add(l.species)
                yield l.species



[docs]
    def get_age(self, species2age):
        """
        Implements the phylostratigrafic method described in:

        Huerta-Cepas, J., & Gabaldon, T. (2011). Assigning duplication events to
        relative temporal scales in genome-wide studies. Bioinformatics, 27(1),
        38-45.
        """
        return max([species2age[sp] for sp in self.get_species()])



[docs]
    def reconcile(self, species_tree):
        """ Returns the reconcilied topology with the provided species
        tree, and a list of evolutionary events inferred from such
        reconciliation. """
        return get_reconciled_tree(self, species_tree, [])



[docs]
    def get_my_evol_events(self, sos_thr=0.0):
        """Return list of duplication and speciation events involving this node.

        Scanned nodes are also labeled internally as dup=True|False.
        You can access these labels using ``node.dup``.

        The algorithm scans all nodes from the given leafName to the
        root. Nodes are assumed to be duplications when a species
        overlap is found between its child linages. The method is
        described in more detail in:

        :Citation:
            *The Human Phylome*. T. Genome Biol. 2007;8(6):R109.
        """
        return spoverlap.get_evol_events_from_leaf(self, sos_thr=sos_thr)



[docs]
    def get_descendant_evol_events(self, sos_thr=0.0):
        """ Returns a list of all duplication and speciation
        events detected after this node. Nodes are assumed to be
        duplications when a species overlap is found between its child
        linages. Method is described more detail in:

        "The Human Phylome." Huerta-Cepas J, Dopazo H, Dopazo J, Gabaldon
        T. Genome Biol. 2007;8(6):R109.
        """
        return spoverlap.get_evol_events_from_root(self, sos_thr=sos_thr)



[docs]
    def get_farthest_oldest_leaf(self, species2age, is_leaf_fn=None):
        """Return the farthest oldest leaf to the current one.

        It requires an species2age dictionary with the age estimation
        for all species.

        :param None is_leaf_fn: A pointer to a function that receives
            a node instance as unique argument and returns True or
            False. It can be used to dynamically collapse nodes, so
            they are seen as leaves.
        """
        root = self.root
        outgroup_dist  = 0
        outgroup_node  = self
        outgroup_age = 0 # self.get_age(species2age)

        for leaf in root.leaves(is_leaf_fn=is_leaf_fn):
            if leaf.get_age(species2age) > outgroup_age:
                outgroup_dist = leaf.get_distance(self, leaf)
                outgroup_node = leaf
                outgroup_age = species2age[leaf.get_species().pop()]
            elif leaf.get_age(species2age) == outgroup_age:
                dist = leaf.get_distance(self, leaf)
                if dist>outgroup_dist:
                    outgroup_dist  = leaf.get_distance(self, leaf)
                    outgroup_node  = leaf
                    outgroup_age = species2age[leaf.get_species().pop()]
        return outgroup_node



[docs]
    def get_farthest_oldest_node(self, species2age):
        """Return the farthest oldest node (leaf or internal).

        The difference with get_farthest_oldest_leaf() is that in this
        function internal nodes grouping seqs from the same species
        are collapsed.
        """
        # I use a custom is_leaf() function to collapse nodes groups
        # seqs from the same species
        is_leaf = lambda node: len(node.get_species())==1
        return self.get_farthest_oldest_leaf(species2age, is_leaf_fn=is_leaf)



[docs]
    def get_age_balanced_outgroup(self, species2age):
        """
        .. versionadded:: 2.2

        Returns the node better balance current tree structure
        according to the topological age of the different leaves and
        internal node sizes.

        :param species2age: A dictionary translating from leaf names
          into a topological age.

        .. warning: This is currently an experimental method!!

        """
        root = self
        all_seqs = set(self.leaf_names())
        outgroup_dist  = 0
        best_balance = max(species2age.values())
        outgroup_node  = self
        outgroup_size = 0

        for leaf in root.descendants():
            leaf_seqs = set(leaf.leaf_names())
            size = len(leaf_seqs)

            leaf_species =[self.props.get('_speciesFunction')(s) for s in leaf_seqs]
            out_species = [self.props.get('_speciesFunction')(s) for s in all_seqs-leaf_seqs]

            leaf_age_min = min([species2age[sp] for sp in leaf_species])
            out_age_min = min([species2age[sp] for sp in out_species])
            leaf_age_max = max([species2age[sp] for sp in leaf_species])
            out_age_max = max([species2age[sp] for sp in out_species])
            leaf_age = leaf_age_max - leaf_age_min
            out_age = out_age_max - out_age_min

            age_inbalance = abs(out_age - leaf_age)

            # DEBUG ONLY
            # leaf.add_features(age_inbalance = age_inbalance, age=leaf_age)

            update = False
            if age_inbalance < best_balance:
                update = True
            elif age_inbalance == best_balance:
                if size > outgroup_size:
                    update = True
                elif size == outgroup_size:
                    dist = self.get_distance(self, leaf)
                    outgroup_dist = self.get_distance(self, outgroup_node)
                    if dist > outgroup_dist:
                        update = True

            if update:
                best_balance = age_inbalance
                outgroup_node = leaf
                outgroup_size = size

        return outgroup_node



[docs]
    def get_speciation_trees(self, map_properties=None, autodetect_duplications=True,
                             newick_only=False, prop='species'):
        """Return number of species trees, of duplications, and an iterator.

        Calculates all possible species trees contained within a
        duplicated gene family tree as described in `Treeko
        <http://treeko.cgenomics.org>`_ (see `Marcet and Gabaldon,
        2011 <http://www.ncbi.nlm.nih.gov/pubmed/21335609>`_ ).

        :param map_properties: List of properties that should be
            mapped from the original gene family tree to each species
            tree subtree.
        :param autodetect_duplications: If True, duplication nodes
            will be automatically detected using the Species Overlap
            algorithm (:func:`PhyloTree.get_descendants_evol_events`).
            If False, duplication nodes within the original tree are
            expected to contain the property "evoltype='D'".
        """
        t = self
        if autodetect_duplications:
            n2content = t.get_cached_content()
            n2species = t.get_cached_content(prop)
            for node in n2content:
                sp_subtotal = sum([len(n2species[_ch]) for _ch in node.children])
                if len(n2species[node]) > 1 and len(n2species[node]) != sp_subtotal:
                    node.props['evoltype'] = 'D'

        sp_trees = get_subtrees(t, properties=map_properties, newick_only=newick_only)

        return sp_trees


    def __get_speciation_trees_recursive(self):
        # NOTE: This function is experimental and for testing.
        t = self.copy()
        if autodetect_duplications:
            dups = 0
            n2content = t.get_cached_content()
            n2species = t.get_cached_content('species')

            #print "Detecting dups"
            for node in n2content:
                sp_subtotal = sum([len(n2species[_ch]) for _ch in node.children])
                if  len(n2species[node]) > 1 and len(n2species[node]) != sp_subtotal:
                    node.props['evoltype'] = 'D'
                    dups += 1
                elif node.is_leaf:
                    node._leaf = True
            #print dups
        else:
            for node in t.leaves():
                node._leaf = True
        subtrees = _get_subtrees_recursive(t)
        return len(subtrees), 0, subtrees


[docs]
    def split_by_dups(self, autodetect_duplications=True):
        """Return the list of subtrees when splitting by its duplication nodes.

        :param True autodetect_duplications: If True, duplication
            nodes will be automatically detected using the Species
            Overlap algorithm
            (:func:`PhyloTree.get_descendants_evol_events`. If False,
            duplication nodes within the original tree are expected to
            contain the feature "evoltype=D".
        """
        try:
            t = self.copy()
        except Exception:
            t = self.copy("deepcopy")

        if autodetect_duplications:
            dups = 0
            n2content = t.get_cached_content()
            n2species = t.get_cached_content('species')

            #print "Detecting dups"
            for node in n2content:
                sp_subtotal = sum([len(n2species[_ch]) for _ch in node.children])
                if  len(n2species[node]) > 1 and len(n2species[node]) != sp_subtotal:
                    node.props['evoltype'] = 'D'
                    dups += 1
                elif node.is_leaf:
                    node._leaf = True
        else:
            for node in t.leaves():
                node._leaf = True
        sp_trees = get_subparts(t)
        return sp_trees



[docs]
    def collapse_lineage_specific_expansions(self, species=None, return_copy=True):
        """ Converts lineage specific expansion nodes into a single
        tip node (randomly chosen from tips within the expansion).

        :param None species: If supplied, only expansions matching the
           species criteria will be pruned. When None, all expansions
           within the tree will be processed.

        """
        if species and isinstance(species, (list, tuple)):
            species = set(species)
        elif species and (not isinstance(species, (set, frozenset))):
            raise TypeError("species argument should be a set (preferred), list or tuple")

        prunned = self.copy("deepcopy") if return_copy else self
        n2sp = prunned.get_cached_content('species')
        n2leaves = prunned.get_cached_content()
        is_expansion = lambda n: (len(n2sp[n])==1 and len(n2leaves[n])>1
                                  and (species is None or species & n2sp[n]))
        for n in prunned.leaves(is_leaf_fn=is_expansion):
            repre = list(n2leaves[n])[0]
            repre.detach()
            if n is not prunned:
                n.up.add_child(repre)
                n.detach()
            else:
                return repre

        return prunned




[docs]
    def annotate_ncbi_taxa(self, taxid_attr='species', tax2name=None, tax2track=None, tax2rank=None, dbfile=None, ignore_unclassified=False):
        """Add NCBI taxonomy annotation to all descendant nodes. Leaf nodes are
        expected to contain a feature (name, by default) encoding a valid taxid
        number.

        All descendant nodes (including internal nodes) are annotated with the
        following new features:

        `Node.spname`: scientific spcies name as encoded in the NCBI taxonomy database

        `Node.named_lineage`: the NCBI lineage track using scientific names

        `Node.taxid`: NCBI taxid number

        `Node.lineage`: same as named_lineage but using taxid codes.


        Note that for internal nodes, NCBI information will refer to the first
        common lineage of the grouped species.

        :param name taxid_attr: the name of the feature that should be used to access the taxid number associated to each node.

        :param None tax2name: A dictionary where keys are taxid
            numbers and values are their translation into NCBI
            scientific name. Its use is optional and allows to avoid
            database queries when annotating many trees containing the
            same set of taxids.

        :param None tax2track: A dictionary where keys are taxid
            numbers and values are their translation into NCBI lineage
            tracks (taxids). Its use is optional and allows to avoid
            database queries when annotating many trees containing the
            same set of taxids.

        :param None tax2rank: A dictionary where keys are taxid
            numbers and values are their translation into NCBI rank
            name. Its use is optional and allows to avoid database
            queries when annotating many trees containing the same set
            of taxids.

        :param None dbfile : If provided, the provided file will be
            used as a local copy of the NCBI taxonomy database.

        :returns: tax2name (a dictionary translating taxid numbers
            into scientific name), tax2lineage (a dictionary
            translating taxid numbers into their corresponding NCBI
            lineage track) and tax2rank (a dictionary translating
            taxid numbers into rank names).

        """

        ncbi = NCBITaxa(dbfile=dbfile)
        return ncbi.annotate_tree(self, taxid_attr=taxid_attr, tax2name=tax2name, tax2track=tax2track, tax2rank=tax2rank, ignore_unclassified=ignore_unclassified)



[docs]
    def annotate_gtdb_taxa(self, taxid_attr='species', tax2name=None, tax2track=None, tax2rank=None, dbfile=None, ignore_unclassified=False):
        gtdb = GTDBTaxa(dbfile=dbfile)
        return gtdb.annotate_tree(self, taxid_attr=taxid_attr, tax2name=tax2name, tax2track=tax2track, tax2rank=tax2rank, ignore_unclassified=ignore_unclassified)



[docs]
    def ncbi_compare(self, autodetect_duplications=True, cached_content=None):
        if not cached_content:
            cached_content = self.get_cached_content()
        cached_species = set([n.props.get('species') for n in cached_content[self]])

        if len(cached_species) != len(cached_content[self]):
            ntrees, ndups, target_trees = self.get_speciation_trees(
                    autodetect_duplications=autodetect_duplications,
                    map_properties=["taxid"])
        else:
            target_trees = [self]


        ncbi = NCBITaxa()
        for t in target_trees:
            ncbi.get_broken_branches(t, cached_content)
Source code for ete4.phylo.phylotree

ETE Toolkit

Navigation

Related Topics