Source code for dockerasmus.pdb.protein

# coding: utf-8
from __future__ import absolute_import
from __future__ import unicode_literals
from __future__ import division

import collections
import six
import copy
import gzip
import functools
import numpy

from ..utils import iterators
from .chain import Chain
from .residue import Residue
from .atom import Atom


[docs]class Protein(collections.OrderedDict):
    __slots__ = ("id", "name", "_atom_charges")

    _CMAP_MODES = {
        'mass_center': lambda r1,r2: r1.distance_to(r2.mass_center),
        'nearest': lambda r1, r2: min(a1.distance_to(a2.pos)
                for a1 in r1.itervalues() for a2 in r2.itervalues()),
        'farthest': lambda r1, r2: max(a1.distance_to(a2.pos)
                for a1 in r1.itervalues() for a2 in r2.itervalues()),
    }

    @staticmethod
    def _parse_pdb_atom_line(line):
        """Return a raw `dict` with atom properties from a pdb atom line.

        Returns:
            `dict`: a dictionary which keys are: ``serial``, ``name``,
                ``chainID``, ``altLoc``, ``resName``, ``resSeq``,
                ``iCode``, ``x``, ``y`` and ``z``.
        """
        schema = {'serial': (6, 11), 'name': (12, 16), 'altLoc': (16, 17),
                  'resName': (17, 20), 'chainID': (21, 22), 'resSeq': (22, 26),
                  'iCode': (26, 27), 'x': (30, 38), 'y': (38, 46), 'z': (46, 54)}
        # Decode a binary string to a unicode/str object
        decode = lambda s: s.decode('utf-8')
        # callback to be called after the value field  is isolated from the line,
        # either to transtype or to decode a binary string
        callbacks = {'serial': int, 'name': decode, 'altLoc': decode,
                    'resName': decode, 'chainID': decode, 'resSeq': int,
                    'iCode': decode, 'x': float, 'y': float, 'z': float}
        return {key: callbacks.get(key)(line[i:j].strip())
                for key,(i,j) in schema.items()}

    @classmethod
[docs]    def from_pdb(cls, handle):
        """Create a new Protein object from a PDB file handle.

        Arguments:
            handle (file handle): a file-like object opened in
                binary read mode (must be line-by-line iterable).
        """

        protein = cls()
        for line in handle:
            if line.startswith(b"ATOM  "):

                atom = cls._parse_pdb_atom_line(line)

                if atom['chainID'] not in protein:
                    protein[atom['chainID']] = Chain(atom['chainID'])

                if atom['resSeq'] not in protein[atom['chainID']]:
                    protein[atom['chainID']][atom['resSeq']] = Residue(atom['resSeq'], atom['resName'])

                protein[atom['chainID']][atom['resSeq']][atom['name']] = Atom(
                    atom['x'], atom['y'], atom['z'], atom['serial'], atom['name'],
                    protein[atom['chainID']][atom['resSeq']],
                )
        return protein

    @classmethod
[docs]    def from_pdb_file(cls, path):
        """Create a new Protein object from a PDB file.

        Arguments:
            path (`str`): the path to a PDB protein file (supports gzipped
                or plain text PDB files).
        """
        if path.endswith('.gz'):
            open_function = gzip.open
        else:
            open_function = functools.partial(open, mode='rb')
        with open_function(path) as pdb_file:
            return cls.from_pdb(pdb_file)

    def __init__(self, id=None, name=None, chains=None):
        """Create a new Protein object.

        Arguments:
            id (`int`): the id of the protein.
            name (`str`): the name of the protein.
            chains (`dict` of `Chain`): a dictionary of the chains
                of the proteins referenced by their ``id``.
        """
        super(Protein, self).__init__(chains or {})
        self.id = id
        self.name = name

        # Memoize matrices and vectors
        self._atom_charges = None
        self._atom_pwd = None
        self._atom_positions = None
        self._atom_radius = None

    def __add__(self, other):
        """Return a new Protein complexed with ``other``.

        Example:
            >>> sorted(barnase.keys())
            [u'B']
            >>> complex = barnase + barstar
            >>> sorted(complex.keys())
            [u'B', u'D']

        Raises:
            ValueError: when there's an ID collision with
                the chain or the protein
            TypeError: when ``other`` is neither a `Chain`
                nor a `Protein`.
        """
        return self.copy().__iadd__(other)

    def __iadd__(self, other):
        """Complex the Protein with ``other``.

        Example:
            >>> prot = Protein()
            >>> prot += barnase
            >>> sorted(prot.keys())
            [u'B']
            >>> prot += barstar
            >>> sorted(prot.keys())
            [u'B', u'D']

        Raises:
            ValueError: when there's an ID collision with
                the chain or the protein
            TypeError: when ``other`` is neither a `Chain`
                nor a `Protein`.
        """
        if isinstance(other, Chain):
            if other.id in self:
                raise ValueError(
                    'Protein already contains a chain'
                    'with id {} !'.format(other.id)
                )
            self[other.id] = other
        elif isinstance(other, Protein):
            if not set(self.keys()).isdisjoint(other.keys()):
                common_keys = set(self.keys()).intersection(other.keys())
                raise ValueError(
                    'Protein already contains a chain with id'
                    ' {} !'.format(', '.join(common_keys))
                )
            self.update(other)
        else:
            raise TypeError(
                "unsupported operand type(s) for +: 'Protein'"
                " and '{}'".format(type(other).__name__)
            )
        return self


    def __contains__(self, item):
        if isinstance(item, six.text_type):
            return super(Protein, self).__contains__(item)
        elif isinstance(item, (Residue, Atom)):
            return any(item in chain for chain in self.itervalues())
        elif isinstance(item, Chain):
            return super(Protein, self).__contains__(item.id)
        else:
            raise TypeError(
                "'in <Protein>' requires Chain, Residue, Atom or unicode"
                " as left operand, not {}".format(type(item).__name__)
            )

    def __getitem__(self, item):
        """Overloaded __getitem__ allowing slicing and individual atom access.

        Example:
            >>> complex = Protein.from_pdb_file("tests/data/1brs.pdb.gz")
            >>> barstar = complex[u'D':]
            >>> sorted(barstar.keys())
            [u'D', u'E', u'F']
        """
        if isinstance(item, slice):
            stop = item.stop or iterators.nth(iterators.wordrange(max(self.keys())), 1)
            start = item.start or min(self.keys())
            return Protein(chains=collections.OrderedDict([
                (k, super(Protein, self).__getitem__(k))
                    for k in iterators.wordrange(start, stop)
            ]))
        else:
            return super(Protein, self).__getitem__(item)

    @property
    def mass(self):
        """The mass of the protein.

        Warning:
            Computed as the sum of the masses of the residues
            of the chain (it does not take the masses of the atoms
            in the peptidic bound into account).
        """
        return sum(chain.mass for chain in self.itervalues())

    @property
    def mass_center(self):
        r"""The position of mass center of the protein.

        .. math::

           mc &= \sum_{i}{\frac{w_i}{W}
             \begin{pmatrix} x_i \\ y_i \\ z_i \end{pmatrix}
           }

        where :math:`i` is the index of each atom, :math:`x_i`
        (resp. :math:`y_i`) (resp. :math:`z_i`) if the abcissa
        (resp. ordinate) (resp. height) of the atom :math:`i` in the
        worldspace, and :math:`W = \sum_i{w_i}` the approximated mass
        of the whole protein.

        Warning:
            Uses `Protein.mass`, so only the atoms on the residues
            of each aminoacid are used for the computation.
        """
        mass = self.mass
        return sum(
            (atom.mass/mass)*atom.pos for chain in self.itervalues()
            for res in chain.itervalues() for atom in res.itervalues()
        )

    @property
    def radius(self):
        """The radius of the sphere the protein would fit in.

        Equals to the norm of the position of the atom of the protein farthest
        from its mass center.
        """
        origin = self.mass_center
        return max(
            atom.distance_to(origin)
                for chain in self.itervalues()
                    for residue in chain.itervalues()
                        for atom in residue.itervalues()
        )

[docs]    def atom_charges(self):
        """The vector of the charge of each atom of the protein.
        """
        if self._atom_charges is None:
            self._atom_charges = numpy.array([
                a.charge for a in self.iteratoms()
            ])
        return self._atom_charges

[docs]    def atom_pwd(self):
        """The vector of the potential well depth of each atom of the protein.
        """
        if self._atom_pwd is None:
            self._atom_pwd = numpy.array([
                a.pwd for a in self.iteratoms()
            ])
        return self._atom_pwd

[docs]    def atom_positions(self):
        """The matrix of the positions of each atom of the protein.
        """
        if self._atom_positions is None:
            self._atom_positions = numpy.array([
                a.pos for a in self.iteratoms()
            ])
        return self._atom_positions

[docs]    def atom_radius(self):
        """The vector of the Van der Waals radius of each atom of the protein.
        """
        if self._atom_radius is None:
            self._atom_radius = numpy.array([
                a.radius for a in self.iteratoms()
            ])
        return self._atom_radius

[docs]    def contact_map(self, other, mode='nearest'):
        """Return a 2D contact map between residues of ``self`` and ``other``.

        Arguments:
            other (`Protein`): the other protein with which to create
                a contact map (chains/residues/atoms must have the same
                names in both proteins)

        Keyword Arguments:
            mode (`str`): how to compute the contact map. Available modes are:
              ``'nearest'`` (the distance between the two closest atoms of
              the two residues), ``'farthest'`` (the distance between the two
              farthest atoms of the two residues) or ``'mass_center'``
              (the distance between the mass center of the two residues).
        """
        if mode not in self._CMAP_MODES:
            raise ValueError("Unknown mode: '{}'".format(mode))
        if not isinstance(other, Protein):
            raise TypeError("other must be a Protein,"
                            " not {}".format(type(other).__name__))

        dim_x = max(r.id for chain in self.itervalues() for r in chain.itervalues())
        dim_y = max(r.id for chain in other.itervalues() for r in chain.itervalues())

        cmap = numpy.zeros((dim_x+1, dim_y+1))

        for c in self.itervalues():
            for r in c.itervalues():
                for other_c in other.itervalues():
                    for other_r in other_c.itervalues():
                        cmap[r.id, other_r.id] = self._CMAP_MODES[mode](r, other_r)

        return cmap

[docs]    def atom(self, atom_id):
        """Get atom of ``self`` with id ``atom_id``.

        Raises:
            KeyError: when no Atom has the given id.
        """
        try:
            atom_id = int(atom_id)
        except ValueError:
            six.raise_from(
                TypeError("Invalid type: {}".format(type(atom_id).__name__)),
                None
            )

        atom_id = int(atom_id)
        atom = next((atom for atom in self.iteratoms() if atom.id==atom_id), None)
        if atom is None:
            raise KeyError("Could not find Atom with id: {}".format(atom_id))
        return atom

[docs]    def residue(self, res_id):
        """Get residue of ``self`` with id ``res_id``.

        Raises:
            KeyError: when no Residue has the given id.
        """
        try:
            res_id = int(res_id)
        except ValueError:
            six.raise_from(
                TypeError("Invalid type: {}".format(type(res_id).__name__)),
                None
            )

        res = next((res for chain in self.itervalues()
                    for res in chain.itervalues() if res.id==res_id), None,
        )
        if res is None:
            raise KeyError("Could not find Residue with id: {}".format(res_id))
        return res

[docs]    def copy(self):
        """Return a deep copy of ``self``.
        """
        return Protein(self.id, self.name, collections.OrderedDict([
            (chain.id, Chain(chain.id, chain.name, collections.OrderedDict([
                (residue.id, copy.deepcopy(residue))
                    for residue in chain.itervalues()
            ])))
                for chain in self.itervalues()
        ]))

[docs]    def iteratoms(self):
        """Yield every atom in ``self``.

        Yields:
            `Atom`: every atom of the protein, ordered by
            the id of their chain and the id of their residue.
        """
        for chain in self.itervalues():
            for residue in chain.itervalues():
                for atom in sorted(residue.itervalues(), key=lambda a: a.id):
                    yield atom

[docs]    def interface(self, other, distance=4.5):
        """Return couples of residues of ``self`` interfacing with ``other``.

        Yields:
            tuple - a (`Residue`, `Residue`) couple where the first element
                is a residue of `self` and the second a residue of `other`
                only if two or more of their respective aminoacids are closer
                than ``distance``.
        """
        if not isinstance(other, Protein):
            raise TypeError("Invalid type: {}".format(type(other).__name__))

        # Reuse the distance matrix computation of dockerasmus.score !
        from ..score.requirements import distance as dist
        mx_distance = dist(self, other)

        # Atoms of self and other in the same order as
        # self.atom_positions() and other.atom_position
        atoms_self = list(self.iteratoms())
        atoms_other = list(other.iteratoms())

        # Start the iterator
        done = set()
        for i in range(mx_distance.shape[0]):
            for j in range(mx_distance.shape[1]):
                if mx_distance[i,j] < distance:
                    res_self, res_other = atoms_self[i].residue, atoms_other[j].residue
                    if (res_self, res_other) not in done:
                        done.add((res_self, res_other))
                        yield res_self, res_other


[docs]    def nearest_atom(self, pos):
        """Return the atom nearest to the position ``pos``.
        """
        return min(
            (atom for chain in self.values() for res in chain.values() for atom in res.values()),
            key = lambda a: a.distance_to(pos)
        )

[docs]    def rmsd(self, other):
        """
        """
        #TODO: matricial computation
        rmsd, length = 0, 0
        if isinstance(other, Protein):
            # Compute pairwise RMSD
            positions_other = other.atom_positions()
            positions_self = self.atom_positions()
            length = len(positions_self)
            if length != len(positions_other):
                raise ValueError(
                "'other' does not have the same number of atoms !"
                )
            rmsd = numpy.sum((positions_self - positions_other)**2)
        elif isinstance(other, (list, numpy.ndarray)):
            # Compute reference-wise RMSD
            if len(other) != 3:
                raise ValueError(
                    "Invalid reference position dimension: {}".format(len(other))
                )
            vec_distance_squared = (self.atom_positions() - other)**2
            rmsd = numpy.sum(vec_distance_squared)
            length = len(vec_distance_squared)
        else:
            raise TypeError("other must be Protein, list or numpy.ndarray,"
                            " not {}".format(type(other).__name__))
        return (rmsd/length)**.5

    if six.PY3:
        def itervalues(self):
            return six.itervalues(self)

        def iteritems(self):
            return six.iteritems(self)