Source code for molSimplify.Classes.protein3D

#  @file protein3D.py
#  Defines protein3D class and contains useful manipulation/retrieval routines.
#
#  Written by HJK Group
#
#  Dpt of Chemical Engineering, MIT

# imports
from molSimplify.Classes.monomer3D import monomer3D
from molSimplify.Classes.mol3D import mol3D
from molSimplify.Classes.atom3D import atom3D
from molSimplify.Classes.helpers import read_atom, makeMol
from molSimplify.Classes.globalvars import globalvars
import urllib.request
import urllib.error
import requests
from bs4 import BeautifulSoup
import pandas as pd
import subprocess
import shlex
import ast
import time
from scipy.spatial import ConvexHull
# from pymol import cmd, stored
# no GUI support for now


[docs]class protein3D:
    """Holds information about a protein, used to do manipulations.  Reads
    information from structure file (pdb, cif) or is directly built from
    molsimplify.

    """

    def __init__(self, pdbCode='undef'):
        # Number of monomers
        self.naas = 0
        # Number of heteromolecules
        self.nhetmols = 0
        # Number of chains
        self.nchains = 0
        # Dictionary of monomers
        self.aas = {}
        # Dictionary of all atoms
        self.atoms = {}
        # Dictionary of all atom indices
        self.a_ids = {}
        # Dictionary of heteromolecules
        self.hetmols = {}
        # Dictionary of chains
        self.chains = {}
        # Dictionary of missing atoms
        self.missing_atoms = {}
        # List of missing monomers
        self.missing_aas = []
        # List of chain locations with more than one conformation
        self.conf = []
        # R value
        self.R = -1
        # Rfree value
        self.Rfree = -1
        # PDB code
        self.pdbCode = pdbCode
        # Holder for metals
        self.metals = False
        # Bonds
        self.bonds = {}
        # Data completeness
        self.DataCompleteness = 0
        # RSRZ value
        self.RSRZ = 100
        # TwinL score
        self.TwinL = 0
        # TwinL^2 score
        self.TwinL2 = 0
        # center of mass
        self.com = []
        # centroid
        self.centroid = []
        # convex hull
        self.hull = []

[docs]    def setAAs(self, aas):
        """
        Set monomers of a protein3D class to different monomers.

        Parameters
        ----------
        aas : dictionary
            Keyed by chain and location
            Valued by monomer3D monomers (amino acids or nucleotides)
        """
        self.aas = aas
        self.naas = len(aas)

[docs]    def setAtoms(self, atoms):
        """
        Set atom indices of a protein3D class to atoms.

        Parameters
        ----------
        atoms : dictionary
            Keyed by atom index
            Valued by atom3D atom that has that index
        """
        self.atoms = atoms

[docs]    def setIndices(self, a_ids):
        """ Set atom indices of a protein3D class to atoms.

        Parameters
        ----------
        a_ids : dictionary
            Keyed by atom3D atom
            Valued by its index
        """
        self.a_ids = a_ids

[docs]    def setHetmols(self, hetmols):
        """
        Set heteromolecules of a protein3D class to different ones.

        Parameters
        ----------
        hetmols : dictionary
            Keyed by chain and location
            Valued by mol3D heteromolecules
        """
        self.hetmols = hetmols
        self.nhetmols = len(hetmols.keys())

[docs]    def setChains(self, chains):
        """
        Set chains of a protein3D class to different chains.

        Parameters
        ----------
        chains : dictionary
            Keyed by desired chain IDs.
            Valued by the list of molecules in the chain.
        """
        self.chains = chains
        self.nchains = len(chains.keys())

[docs]    def setMissingAtoms(self, missing_atoms):
        """
        Set missing atoms of a protein3D class to a new dictionary.

        Parameters
        ----------
        missing_atoms : dictionary
            Keyed by amino acid residues / nucleotides of origin
            Valued by missing atoms
        """
        self.missing_atoms = missing_atoms

[docs]    def setMissingAAs(self, missing_aas):
        """
        Set missing amino acids of a protein3D class to a new list.

        Parameters
        ----------
        missing_aas : list
            List of missing amino acids.
        """
        self.missing_aas = missing_aas

[docs]    def setConf(self, conf):
        """
        Set possible conformations of a protein3D class to a new list.

        Parameters
        ----------
        conf : list
            List of possible conformations for applicable amino acids.
        """
        self.conf = conf

[docs]    def autoChooseConf(self):
        """
        Automatically choose the conformation of a protein3D class
        instance based first on what the greatest occupancy level is and then
        the first conformation ihe alphabet with all else equal.

        """
        for c in self.conf:
            c_ids = []
            if c in self.aas.keys():
                lst = self.aas[c]
            else:
                lst = self.hetmols[c]
            if len(lst) == 1:
                self.chains[c[0]].insert(c[1]-1, lst[0])
            else:
                for li in lst:
                    if li not in self.chains[c[0]]:
                        for j in li.atoms:
                            in_more_confs = False
                            for m in lst:
                                if m != li and j in m.atoms:
                                    in_more_confs = True
                            if type(j) != atom3D and not in_more_confs:
                                c_ids.append(j[0])
                            elif not in_more_confs:
                                c_ids.append(self.getIndex(j))
                        # print(c_ids)
                        self.stripAtoms(c_ids)
                        if type(li) == monomer3D and li in self.aas[c]:
                            self.aas[c].remove(li)
                        elif type(li) == mol3D and li in self.hetmols[c]:
                            self.hetmols[c].remove(li)
        self.setConf([])

[docs]    def setR(self, R):
        """
        Set R value of protein3D class.

        Parameters
        ----------
        R : float
            The desired new R value.
        """
        self.R = R

[docs]    def setRfree(self, Rfree):
        """
        Set Rfree value of protein3D class.

        Parameters
        ----------
        Rfree : float
            The desired new Rfree value.
        """
        self.Rfree = Rfree

[docs]    def setRSRZ(self, RSRZ):
        """
        Set RSRZ score of protein3D class.

        Parameters
        ----------
        RSRZ : float
            The desired new RSRZ score.
        """
        self.RSRZ = RSRZ

[docs]    def getMissingAtoms(self):
        """
        Get missing atoms of a protein3D class.

        Examples
        --------
        >>> pdb_system = protein3D()
        >>> pdb_system.fetch_pdb('1MH1') # Fetch a PDB
        fetched: 1MH1
        >>> missing_atoms = pdb_system.getMissingAtoms()

        List atoms in the first set of missing_atoms
        >>> [atom.sym for atom in list(missing_atoms)[0]]
        ['C', 'C', 'C', 'C', 'C', 'C', 'O']
        """
        return self.missing_atoms.values()

[docs]    def getMissingAAs(self):
        """
        Get missing amino acid residues of a protein3D class.

        Examples
        --------
        >>> pdb_system = protein3D()
        >>> pdb_system.fetch_pdb('1MH1') # Fetch a PDB
        fetched: 1MH1
        >>> pdb_system.getMissingAAs()   # This gives a list of monomer3D objects
        [monomer3D(VAL, id=182), monomer3D(LYS, id=183), monomer3D(LYS, id=184)]
        """
        return self.missing_aas

[docs]    def countAAs(self):
        """
        Return the number of amino acid residues in a protein3D class.

        Examples
        --------
        >>> pdb_system = protein3D()
        >>> pdb_system.fetch_pdb('1os7') # Fetch a PDB
        fetched: 1os7
        >>> pdb_system.countAAs() # This return the number of AAs in the PDB for all the chains.
        1121
        """
        return self.naas

[docs]    def findAtom(self, sym="X", aa=True):
        """
        Find atoms with a specific symbol that are contained in amino acids
        or heteromolecules.

        Parameters
        ----------
        sym : str
            element symbol, default as X.
        aa : boolean
            True if we want atoms contained in amino acids
            False if we want atoms contained in heteromolecules

        Returns
        -------
        inds: list
            a list of atom indices with the specified symbol.

        Examples
        --------
        >>> pdb_system = protein3D()
        >>> pdb_system.fetch_pdb('1os7') # Fetch a PDB
        fetched: 1os7
        >>> pdb_system.findAtom(sym="S", aa=True) # Returns indices of sulphur atoms present in amino acids
        [2166, 4442, 6733, 9041]
        >>> pdb_system.findAtom(sym="S", aa=False) # Returns indices of sulphur atoms present in heteromolecules
        [9164, 9182, 9200]
        """
        inds = []
        if aa:
            mols = self.aas.values()
        else:
            mols = self.hetmols.values()
        for s in mols:
            for m in s:
                for a in m.atoms:
                    if type(a) == tuple:
                        ii = a[0]
                        a = a[1]
                    else:
                        ii = self.getIndex(a)
                    if a.symbol() == sym:
                        inds.append(ii)
        return inds

[docs]    def findAA(self, three_lc="XAA"):
        """
        Find amino acids with a specific three-letter code.

        Parameters
        ----------
        three_lc: str
            three-letter code, default as XAA.

        Returns
        -------
        inds: set
            a set of amino acid indices with the specified symbol.

        Examples
        --------
        >>> pdb_system = protein3D()
        >>> pdb_system.fetch_pdb('1os7') # Fetch a PDB
        fetched: 1os7

        Return a set of pairs where each pair is a combination of the chain name and
        the index of the amino acid specified (in this case, 'MET')
        >>> aa_set = pdb_system.findAA(three_lc = 'MET')
        >>> sorted(aa_set)  # Sorting for reproducible order in doctest
        [('A', 268), ('B', 268), ('C', 268), ('D', 268)]
        """
        inds = set()
        for aa in self.aas.values():
            if aa[0].three_lc == three_lc:
                inds.add((aa[0].chain, aa[0].id))
        return inds

[docs]    def getChain(self, chain_id):
        """
        Takes a chain of interest and turns it into its own protein3D class instance.

        Parameters
        ----------
        chain_id : string
            The letter name of the chain of interest

        Returns
        -------
        p : protein3D
            A protein3D instance consisting of just the chain of interest

        Examples
        --------
        >>> pdb_system = protein3D()
        >>> pdb_system.fetch_pdb('1os7') # Fetch a PDB
        fetched: 1os7
        >>> pdb_system.getChain('A') # doctest: +SKIP
        """
        p = protein3D()
        p.setPDBCode(self.pdbCode)
        p.setChains({chain_id: self.chains[chain_id]})
        p.setR(self.R)
        p.setRfree(self.Rfree)

        missing_aas = []
        for aa in self.missing_aas:
            if aa.chain == chain_id:
                missing_aas.append(aa)
        p.setMissingAAs(missing_aas)

        aas = {}
        for aa in self.aas:
            if aa[0] == chain_id:
                aas[aa] = self.aas[aa]
        p.setAAs(aas)

        gone_atoms = {}
        for aa in self.missing_atoms.keys():
            if aa[0] == chain_id:
                gone_atoms[aa] = self.missing_atoms[aa]
        p.setMissingAtoms(gone_atoms)

        hets_flipped = {value[0]: key for key, value in self.hetmols.items()}
        atoms = {}
        a_ids = {}
        hets = {}
        for a_id in self.atoms:
            aa = self.getMolecule(a_id)

            if type(aa) == monomer3D:
                if aa.chain == chain_id:
                    atoms[a_id] = self.atoms[a_id]
                    a_ids[self.atoms[a_id]] = a_id
            else:
                if aa not in hets_flipped:
                    print(a_id)
                het = hets_flipped[aa]
                het_chain_id = het[0]
                if het_chain_id == chain_id:
                    hets[het] = self.hetmols[het]
                    atoms[a_id] = self.atoms[a_id]
                    a_ids[self.atoms[a_id]] = a_id

        p.setHetmols(hets)
        p.setAtoms(atoms)

        bonds = {}
        for a in self.bonds.keys():
            if a in p.atoms.values():
                bonds[a] = set()
                for b in self.bonds[a]:
                    if b in p.atoms.values():
                        bonds[a].add(b)
        p.setBonds(bonds)

        p.setIndices(a_ids)
        p.setConf([conf for conf in self.conf if conf[0] == chain_id])

        return p

[docs]    def getMolecule(self, a_id, aas_only=False):
        """
        Finds the molecule that the atom is contained in.

        Parameters
        ----------
        a_id : int
            The index of the desired atom whose molecule we want to find
        aas_only : boolean
            True if we want ito find atoms contained in amino acids only.
            False if we want atoms contained in all molecules. Default is False.

        Returns
        -------
        mol : monomer3D or mol3D
            The amino acid residue, nucleotide, or heteromolecule containing the atom

        Examples
        --------
        >>> pdb_system = protein3D()
        >>> pdb_system.fetch_pdb('1os7') # Fetch a PDB
        fetched: 1os7

        This returns an molSimplify.Classes.monomer3D object indicating that the atom is part of an amino acid or nucleotide:
        >>> pdb_system.getMolecule(a_id=2166)
        monomer3D(MET, id=268)

        This returns a mol3D object indicating that the atom is part of a molecule that is not an amino acid or nucleotide
        >>> pdb_system.getMolecule(a_id=9164)
        mol3D(S1O3N1C2)
        >>> pdb_system.getMolecule(a_id=9164).name # This prints the name of the molecule, in this case, it is 'TAU'
        'TAU'
        """
        for s in self.aas.values():
            for mol in s:  # mol is monomer3D
                if (a_id, self.atoms[a_id]) in mol.atoms:
                    return mol
        for mol in self.missing_atoms.keys():  # mol is incomplete monomer3D
            if (a_id, self.atoms[a_id]) in self.missing_atoms[mol]:
                return mol
        if not aas_only:
            for s in self.hetmols.values():
                for mol in s:  # mol is mol3D
                    if self.atoms[a_id] in mol.atoms:
                        return mol
        return None  # something is wrong

[docs]    def stripAtoms(self, atoms_stripped):
        """
        Removes certain atoms from the protein3D class instance.

        Parameters
        ----------
        atoms_stripped : list
            List of atom3D indices that should be removed

        Examples
        --------
        >>> pdb_system = protein3D()
        >>> pdb_system.fetch_pdb('1os7') # Fetch a PDB
        fetched: 1os7
        >>> pdb_system.stripAtoms([2166, 4442, 6733, 2165]) # This removes the list of atoms with
        >>>                                                # indices listedin the code
        """
        atoms = self.atoms
        a_ids = self.a_ids
        keys = list(self.aas.keys()) + list(self.hetmols.keys())
        for tup in keys:
            if tup in self.aas.keys():
                mol_set = self.aas[tup].copy()
            else:
                mol_set = self.hetmols[tup].copy()
            for elt in mol_set:
                for a in elt.atoms:
                    if type(a) != atom3D:
                        atom = a[1]
                    else:
                        atom = a
                    if atom not in self.a_ids.keys():
                        continue
                    a_id = self.getIndex(atom)
                    if a_id in atoms_stripped:
                        if (a_id, atom) in elt.atoms:
                            elt.atoms.remove((a_id, atom))
                            if atom in elt.c:
                                elt.c.remove(atom)
                            elif atom in elt.n:
                                elt.n.remove(atom)
                        elif atom in elt.atoms:
                            elt.atoms.remove(atom)
                        atoms_stripped.remove(a_id)
                        if atom in self.bonds.keys():
                            for at in self.bonds[atom]:
                                if at in self.bonds.keys():
                                    temp = self.bonds[at].copy()
                                    if atom in temp:
                                        temp.remove(atom)
                                    self.bonds[at] = temp
                            del self.bonds[atom]
                        del atoms[a_id]
                        del a_ids[atom]
                if len(elt.atoms) == 0:
                    if tup in self.aas.keys():
                        self.aas[tup].remove(elt)
                        if len(self.aas[tup]) == 0:
                            del self.aas[tup]
                    else:
                        self.hetmols[tup].remove(elt)
                        if len(self.hetmols[tup]) == 0:
                            del self.hetmols[tup]
        while len(atoms_stripped) != 0:
            a_id = atoms_stripped[0]
            atoms_stripped.pop(0)
            if a_id not in atoms.keys():
                continue
            atom = atoms[a_id]
            if atom in self.bonds.keys():
                for at in self.bonds[atom]:
                    temp = self.bonds[at].copy()
                    if atom in temp:
                        temp.remove(atom)
                    self.bonds[at] = temp
                del self.bonds[atom]
            del atoms[a_id]
            del a_ids[atom]
        self.setAtoms(atoms)
        self.setIndices(a_ids)

[docs]    def stripHetMol(self, hetmol):
        """
        Removes all heteroatoms part of the specified heteromolecule from
            the protein3D class instance.

        Parameters
        ----------
        hetmol : str
            String representing the name of a heteromolecule whose
            heteroatoms should be stripped from the protein3D class instance

        Examples
        --------
        >>> pdb_system = protein3D()
        >>> pdb_system.fetch_pdb('3I40') # Fetch a PDB
        fetched: 3I40
        >>> pdb_system.stripHetMol('HOH')
        """
        hets = self.hetmols.copy()
        for k in hets.keys():
            if k not in self.hetmols.keys():
                continue
            for m in hets[k]:
                if m.name == hetmol:
                    ids = []
                    for a in m.atoms:
                        ids.append(self.a_ids[a])
                    self.stripAtoms(ids)
                    try:  # RM 2023/04/22: I don't think this is necessary as stripAtoms takes care of deleting the hetmol
                        del self.hetmols[k]
                    except KeyError:
                        pass

[docs]    def findMetal(self, transition_metals_only=True):
        """
        Find metal(s) in a protein3D class.

        Parameters
        ----------
        transition_metals_only : bool, optional
            Only find transition metals. Default is true.

        Returns
        -------
        metal_list : list
            List of indices of metal atoms in protein3D.

        Examples
        --------
        >>> pdb_system = protein3D()
        >>> pdb_system.fetch_pdb('1os7')
        fetched: 1os7
        >>> pdb_system.findMetal()
        [9160, 9178, 9196, 9214]
        """
        if not self.metals:
            metal_list = []
            for li in self.hetmols.values():  # no metals in AAs
                for m in li:
                    for a in m.atoms:
                        if a.ismetal(transition_metals_only=transition_metals_only):
                            if a.occup == 1 or a in self.bonds.keys():
                                metal_list.append(self.getIndex(a))
            self.metals = metal_list
        return (self.metals)

[docs]    def freezeatom(self, atomIdx):
        """
        Set the freeze attribute to be true for a given atom3D class.

        Parameters
        ----------
        atomIdx : int
            Index for atom to be frozen.
        """

        self.atoms[atomIdx].frozen = True

[docs]    def freezeatoms(self, Alist):
        """
        Set the freeze attribute to be true for a given set of atom3D classes,
        given their indices. Preserves ordering, starts from largest index.

        Parameters
        ----------
        Alist : list
            List of indices for atom3D instances to remove.
        """

        for h in sorted(Alist, reverse=True):
            self.freezeatom(h)

[docs]    def getAtom(self, idx):
        """
        Get atom with a given index.

        Parameters
        ----------
        idx : int
            Index of desired atom.

        Returns
        -------
        atom : atom3D
            atom3D class for element at given index.

        """
        return self.atoms[idx]

[docs]    def getIndex(self, atom):
        """
        Get index of a given atom

        Parameters
        ----------
        atom : atom3D
            atom3D class for element at given index.

        Returns
        -------
        idx : int
            Index of desired atom.

        """
        if hasattr(self, 'a_ids') and atom in self.a_ids.keys():
            idx = self.a_ids[atom]
        else:
            idx = list(self.atoms.keys())[list(self.atoms.values()).index(atom)]
        return idx

[docs]    def getBoundMols(self, h_id, aas_only=False):
        """
        Get a list of molecules bound to a heteroatom, usually a metal.

        Parameters
        ----------
        h_id : int
            The index of the desired (hetero)atom origin
        aas_only : boolean
            Whether or not to only consider amino acids, defaults False

        Returns
        -------
        bound_mols : list
            List of monomer3D and/or mol3D instances of molecules bound to hetatm
        """
        bound_mols = []
        for b_id in self.atoms.keys():
            b = self.atoms[b_id]
            if self.atoms[h_id] not in self.bonds.keys():
                return None
            elif b in self.bonds[self.atoms[h_id]]:
                if self.getMolecule(b_id, aas_only) is not None:
                    bound_mols.append(self.getMolecule(b_id, aas_only))
        return bound_mols

[docs]    def readfrompdb(self, text):
        """
        Read PDB into a protein3D class instance.

        Parameters
        ----------
        text : str
            String of path to PDB file. Path may be local or global.
            May also be the text of a PDB file from the internet.
        """

        # read in PDB file
        if '.pdb' in text:  # means this is a filename
            self.pdbfile = text
            fname = text.split('.pdb')[0]
            with open(fname + '.pdb', 'r') as f:
                text = f.read()
            enter = '\n'
        else:
            enter = "\\n"

        # class attributes
        aas = {}
        hetmols = {}
        atoms = {}
        a_ids = {}
        chains = {}
        missing_atoms = {}
        missing_aas = []
        conf = []
        bonds = {}

        # get R and Rfree values (text is full file)
        if "R VALUE            (WORKING SET)" in text:
            temp = text.split("R VALUE            (WORKING SET)")
            temp2 = temp[-1].split()
            if temp2[1] != 'NULL':
                R = float(temp2[1])
            else:
                R = -100
            if temp2[8] != 'NULL':
                Rfree = float(temp2[8])
            else:
                Rfree = 100
        elif "R VALUE          (WORKING SET, NO CUTOFF)" in text:
            temp = text.split("R VALUE          (WORKING SET, NO CUTOFF)")
            temp2 = temp[-1].split()
            if temp2[1] != 'NULL':
                R = float(temp2[1])
            else:
                R = -100
            if temp2[10] != 'NULL':
                Rfree = float(temp2[10])
            else:
                Rfree = 100
        else:
            R = -100
            Rfree = 100

        # start getting missing amino acids
        if "M RES C SSSEQI" in text:
            text = text.split("M RES C SSSEQI")
            want = text[-1]
            text = text[0].split(enter)
            split = text[-1]
            want = want.split(split)
            for line in want:
                if line == want[-1]:
                    text = line
                    line = line.split(enter)
                    line = line[0]
                    text = text.replace(line, '')
                sp = line.split()
                if len(sp) > 2:
                    res_num = int(sp[2])
                    # Ignoring expression tags which are negative residues
                    if res_num > 0:
                        a = monomer3D(sp[0], sp[1], sp[2])
                        missing_aas.append(a)

        # start getting missing atoms
        if "M RES CSSEQI  ATOMS" in text:
            text = text.split("M RES CSSEQI  ATOMS")
            want = text[-1]
            text = text[0].split(enter)
            split = text[-1]
            want = want.split(split)
            for line in want:
                if line == want[-1]:
                    text = line
                    line = line.split(enter)
                    line = line[0]
                    text = text.replace(line, '')
                sp = line.split()
                if len(sp) > 2:
                    missing_atoms[(sp[1], sp[2])] = []
                    for atom in sp[3:]:
                        if atom != enter and atom[0] in ['C', 'N', 'O', 'H']:
                            missing_atoms[(sp[1], sp[2])].append(
                                atom3D(Sym=atom[0], greek=atom))
        # start getting amino acids, nucleotides and heteroatoms
        pa_dict = {'AltLoc': ""}
        if "ENDMDL" in text:
            text.split("ENDMDL")
            text = text[-2] + text[-1]
        text = text.split(enter)
        text = text[1:]
        for line in text:
            if line == text[-1]:
                text = line
                line = line.split(enter)
                line = line[0]
                text = text.replace(line, '')
            l_type = line[:6]
            if "ATOM" in l_type or "HETATM" in l_type:
                line = line.replace("\\'", "\'")
                a_dict = read_atom(line)
                if a_dict['ResName'] in globalvars().getAllAAs() or "ATOM" in l_type:
                    # have an amino acid or biomolecule monomer
                    a, aas, conf, chains, pa_dict, bonds = makeMol(a_dict, aas, conf, chains, pa_dict, bonds)
                else:  # have a normal heteromolecule
                    a, hetmols, conf, chains, pa_dict, bonds = makeMol(a_dict, hetmols, conf, chains, pa_dict, bonds, False)
                atoms[a_dict['SerialNum']] = a
                a_ids[a] = a_dict['SerialNum']

            elif "CONECT" in l_type:  # get extra connections
                line = line[6:]  # remove type
                li = [line[i:i+5] for i in range(0, len(line), 5)]
                if int(li[0]) in atoms.keys() and atoms[int(li[0])] not in bonds.keys():
                    bonds[atoms[int(li[0])]] = set()
                for i in li[1:]:
                    try:
                        bonds[atoms[int(li[0])]].add(atoms[int(i)])
                        if atoms[int(li[0])].loc != '':
                            for j in {1, -1}:
                                if atoms[int(li[0]) + j].greek == atoms[int(li[0])].greek:
                                    if atoms[int(li[0]) + j] not in bonds.keys():
                                        bonds[atoms[int(li[0]) + j]] = {atoms[int(i)]}
                                    else:
                                        bonds[atoms[int(li[0]) + j]].add(atoms[int(i)])
                                    if atoms[int(i)] not in bonds.keys():
                                        bonds[atoms[int(i)]] = {atoms[int(li[0]) + j]}
                                    else:
                                        bonds[atoms[int(i)]].add(atoms[int(li[0]) + j])
                    except ValueError:
                        # if "  " not in i and i != " ":
                        #    print("likely OXT")
                        continue
        # deal with conformations in chains
        for i in conf:
            if i in aas.keys():
                c = aas[i]
            else:
                c = hetmols[i]
            for j in range(len(c)):
                # pick chain with higher occupancy or the A chain if tie
                if type(c[j]) == mol3D:
                    for a in c[j].atoms:
                        full = True
                        if a.occup <= 1/len(c):
                            full = False
                    if full:
                        chains[i[0]].append(c[j])
                    elif c[j].atoms[0].occup*100 == 100//len(c) and j == 0:
                        chains[i[0]].append(c[j])
                elif c[j].occup > 1/len(c):
                    chains[i[0]].append(c[j])
                elif c[j].occup*100 == 100//len(c) and j == 0:
                    chains[i[0]].append(c[j])
        self.setChains(chains)
        self.setAAs(aas)
        self.setAtoms(atoms)
        self.setIndices(a_ids)
        self.setHetmols(hetmols)
        self.setMissingAtoms(missing_atoms)
        self.setMissingAAs(missing_aas)
        self.setConf(conf)
        self.setR(R)
        self.setRfree(Rfree)
        self.setBonds(bonds)

[docs]    def fetch_pdb(self, pdbCode):
        """
        API query to fetch a pdb and write it as a protein3D class instance

        Parameters
        ----------
        pdbCode : str
            Code for protein, e.g. 1os7
        """
        remoteCode = pdbCode.upper()
        try:
            data = urllib.request.urlopen(
                'https://files.rcsb.org/view/' + remoteCode +
                '.pdb').read()
        except urllib.error.URLError:
            print("warning: %s not found.\n" % pdbCode)
        else:
            try:
                self.readfrompdb(str(data))
                self.setPDBCode(pdbCode)
                print("fetched: %s" % (pdbCode))
            except IOError:
                print('aborted')
            else:
                if len(data) == 0:
                    print("warning: %s not valid.\n" % pdbCode)

[docs]    def setBonds(self, bonds):
        """
        Sets the bonded atoms in the protein.

        This is effectively the molecular graph.

        Parameters
        ----------
        bonds : dictionary
            Keyed by atom3D atoms in the protein
            Valued by a set consisting of bonded atoms
        """
        self.bonds = bonds

[docs]    def readMetaData(self):
        """
        API query to fetch XML data from a pdb and add its useful attributes
        to a protein3D class.

        Parameters
        ----------
        pdbCode : str
            Code for protein, e.g. 1os7
        """
        pdbCode = self.pdbCode
        try:
            start = 'https://files.rcsb.org/pub/pdb/validation_reports/' + pdbCode[1] + pdbCode[2]
            link = start + '/' + pdbCode + '/' + pdbCode + '_validation.xml'
            xml_doc = requests.get(link)
        except urllib.error.URLError:
            print("warning: %s not found.\n" % pdbCode)
        else:
            try:
                # We then use beautiful soup to read the XML doc. LXML is an XML reader.
                # The soup object is what we then use to parse!
                soup = BeautifulSoup(xml_doc.content, 'lxml-xml')

                # We can then use methods of the soup object to find "tags" within the XML file.
                # This is how we would extract sections.
                # This is an example of getting everything with a "sec" tag.
                body = soup.find_all('wwPDB-validation-information')
                entry = body[0].find_all("Entry")
                if "DataCompleteness" not in entry[0].attrs.keys():
                    self.setDataCompleteness(0)
                    print("warning: %s has no DataCompleteness." % pdbCode)
                else:
                    self.setDataCompleteness(float(entry[0].attrs["DataCompleteness"]))
                if "percent-RSRZ-outliers" not in entry[0].attrs.keys():
                    self.setRSRZ(100)
                    print("warning: %s has no RSRZ.\n" % pdbCode)
                else:
                    self.setRSRZ(float(entry[0].attrs["percent-RSRZ-outliers"]))
                if "TwinL" not in entry[0].attrs.keys():
                    print("warning: %s has no TwinL." % pdbCode)
                    self.setTwinL(0)
                else:
                    self.setTwinL(float(entry[0].attrs["TwinL"]))
                if "TwinL2" not in entry[0].attrs.keys():
                    print("warning: %s has no TwinL2." % pdbCode)
                    self.setTwinL2(0)
                else:
                    self.setTwinL2(float(entry[0].attrs["TwinL2"]))
            except IOError:
                print('aborted')
            else:
                if xml_doc is None:
                    print("warning: %s not valid.\n" % pdbCode)

[docs]    def setDataCompleteness(self, DataCompleteness):
        """
        Set DataCompleteness value of protein3D class.

        Parameters
        ----------
        DataCompleteness : float
            The desired new R value.
        """
        self.DataCompleteness = DataCompleteness

[docs]    def setTwinL(self, TwinL):
        """
        Set TwinL score of protein3D class.

        Parameters
        ----------
        TwinL : float
            The desired new TwinL score.
        """
        self.TwinL = TwinL

[docs]    def setTwinL2(self, TwinL2):
        """
        Set TwinL squared score of protein3D class.

        Parameters
        ----------
        TwinL2 : float
            The desired new TwinL squared score.
        """
        self.TwinL2 = TwinL2

[docs]    def setEDIAScores(self):
        """
        Sets the EDIA score of a protein3D class.

        Parameters
        ----------
        pdbCode : string
            The 4-character code of the protein3D class.
        """
        code = self.pdbCode
        cmd = ('curl -d \'{"edia":{ "pdbCode":"'+code+'"}}\' -H "Accept: application/json"'
               ' -H "Content-Type: application/json" -X POST https://proteins.plus/api/edia_rest -k')
        args = shlex.split(cmd)
        result = subprocess.Popen(args, stdout=subprocess.PIPE,
                                  stderr=subprocess.PIPE)
        result.wait()
        out, err = result.communicate()
        dict_str = out.decode("UTF-8")
        int_dict = ast.literal_eval(dict_str)
        res2 = subprocess.Popen(['curl', '-k', int_dict['location']],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out2, err2 = res2.communicate()
        dict2_str = out2.decode("UTF-8")
        dictionary = ast.literal_eval(dict2_str)
        t = 5  # can change depending on how frequently to loop
        while dictionary["status_code"] == 202:
            res2 = subprocess.Popen(['curl', '-k ', int_dict['location']],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            # print('sleeping', t)
            time.sleep(t)
            res2.wait()
            out2, err2 = res2.communicate()
            dict2_str = out2.decode("UTF-8")
            dictionary = ast.literal_eval(dict2_str)
        link = dictionary["atom_scores"]
        df = pd.read_csv(link, on_bad_lines='skip')

        for i, row in df.iterrows():
            EDIA = row["EDIA"]
            index = row["Infile id"]
            if index in self.atoms.keys():
                a = self.atoms[index]
                a.setEDIA(EDIA)
                if a.occup < 1:  # more than one conformation
                    subdf = df[df["Infile id"] == index+1]
                    if subdf.shape[0] == 0 and index+1 in self.atoms.keys():
                        self.atoms[index+1].setEDIA(EDIA)
                    elif subdf.shape[0] == 0 and index-1 in self.atoms.keys():
                        self.atoms[index-1].setEDIA(EDIA)
            else:
                print("OXT is missing")

[docs]    def setPDBCode(self, pdbCode):
        """
        Sets the 4-letter PDB code of a protein3D class instance

        Parameters
        ----------
        pdbCode : string
            Desired 4-letter PDB code
        """
        self.pdbCode = pdbCode

[docs]    def centermass(self):
        """Computes coordinates of center of mass of protein.

        """

        center_of_mass = [0, 0, 0]  # coordinates of center of mass (X, Y, Z)
        mmass = 0
        # loop over atoms in molecule
        if len(self.atoms.keys()) > 0:
            for atom in self.atoms.values():
                # calculate center of mass (relative weight according to atomic mass)
                xyz = atom.coords()
                center_of_mass[0] += xyz[0] * atom.mass
                center_of_mass[1] += xyz[1] * atom.mass
                center_of_mass[2] += xyz[2] * atom.mass
                mmass += atom.mass
            # normalize
            center_of_mass[0] /= mmass
            center_of_mass[1] /= mmass
            center_of_mass[2] /= mmass
        else:
            center_of_mass = False
            print(
                'ERROR: Center of mass calculation failed. Structure will be inaccurate.\n')
        self.com = center_of_mass

[docs]    def setCentroid(self):
        """Computes coordinates of center of mass of protein.

        """

        centroid = [0, 0, 0]  # coordinates of centroid (X, Y, Z)
        # loop over atoms in protein
        if len(self.atoms.keys()) > 0:
            for atom in self.atoms.values():
                # calculate center of mass (relative weight according to atomic mass)
                xyz = atom.coords()
                centroid[0] += xyz[0]
                centroid[1] += xyz[1]
                centroid[2] += xyz[2]
            # normalize
            centroid[0] /= len(self.atoms.keys())
            centroid[1] /= len(self.atoms.keys())
            centroid[2] /= len(self.atoms.keys())
        else:
            centroid = False
            print(
                'ERROR: Centroid calculation failed. Structure will be inaccurate.\n')
        self.centroid = centroid

[docs]    def convexhull(self):
        """
        Computes convex hull of protein.

        Returns
        -------
        hull : array
            Coordinates of convex hull.
        """
        points = []
        # loop over atoms in protein
        if len(self.atoms.keys()) > 0:
            for atom in self.atoms.values():
                points.append(atom.coords())
            hull = ConvexHull(points)
        else:
            hull = False
            print(
                'ERROR: Convex hull calculation failed. Structure will be inaccurate.\n')
        self.hull = hull