Source code for mrdna.readers.segmentmodel_from_pdb

# -*- coding: utf-8 -*-
[docs] def printv(v): return " ".join([str(x) for x in v])
import pdb import numpy as np import os,sys from glob import glob import re import MDAnalysis as mda import MDAnalysis.analysis.align as align from MDAnalysis.analysis.distances import distance_array from ..model.CanonicalNucleotideAtoms import CanonicalNucleotideFactory from ..segmentmodel import SegmentModel, SingleStrandedSegment, DoubleStrandedSegment from .. import get_resource_path from .segmentmodel_from_lists import model_from_basepair_stack_3prime from .segmentmodel_from_lists import _three_prime_list_to_five_prime from .segmentmodel_from_lists import find_stacks resnames = dict(A='ADE DA DA5 DA3', T='THY DT DT5 DT3', C='CYT DC DC5 DC3', G='GUA DG DG5 DG3') # bp_bond_atoms = dict( A='N1', T='H3', # C='N3', G='H1' ) bp_bond_atoms = dict( A='N1', T='N3', C='N3', G='N1' ) bases = resnames.keys() resname_to_key = {n:k for k,v in resnames.items() for n in v.split()} complement = dict(A='T', C='G', T='A', G='C') refUnis = {b:mda.Universe(str(get_resource_path("generate-nts/1-w3dna/{}{}.pdb".format(b,complement[b])))) for b in bases} stack_text = "not name O5' P O1P O2P OP1 OP2 O3' 1H* 2H* 3H* 0H* H*" ref_name_ids_inv = dict() for key,val in refUnis.items(): ref_res = val.residues[0] names = ref_res.atoms.select_atoms( stack_text ).atoms.names ref_name_ids_inv[key] = np.argsort(np.argsort(names)) val.atoms.positions = val.atoms.positions.dot(CanonicalNucleotideFactory.DefaultOrientation) # return to default orientation ## TODO: possibly improve ref_stack_position # avg_stack_position = [ru.residues[0].atoms.select_atoms(stack_text).center_of_mass() # for k,ru in refUnis.items()] # avg_stack_position = np.mean(avg_stack_position,axis=0) # print(avg_stack_position) ## Obtained by apply stack transform to base ref_stack_position = np.array((-2.41851735, -0.259761333, 3.39999978)) ref_bp_position = np.array((0, -8.7434375, -0.326354))
[docs] def read_ter_from_pdb( *args ): pdbfile = args[0] for arg in args: if arg[-3:].lower() == "pdb": pdbfile = arg ter = [] with open(pdbfile) as fh: residue = -1 last = "" for line in fh: if line[:3] == "TER": ter.append(residue) elif line[:3] == "ATO" or line[:3] == "HET": if last == "" or int(last[22:26]) != int(line[22:26]): residue += 1 last = line return ter
[docs] def residues_within( cutoff, coords1, coords2, sel1, sel2 ): """ Returns lists of index of residues within sel1 and sel2 such that sel1.residue[arr_idx1[i]] is within cutoff of sel2.residue[arr_idx2[i]] for all i Also returns distance_matrix Ignores residues that are the same in sel1 and sel2 """ assert(len(sel1.atoms) > 0 and len(sel2.atoms) > 0) assert(len(sel1.residues) == len(coords1)) assert(len(sel2.residues) == len(coords2)) distance_matrix = distance_array( coords1, coords2 ) ## Ignore comparisons with self res_diff = sel1.residues.resindices[:,np.newaxis] - sel2.residues.resindices[np.newaxis,:] assert( res_diff.shape == distance_matrix.shape ) distance_matrix[ res_diff == 0 ] = distance_matrix[ res_diff == 0 ] + 10 *cutoff arr_idxs = np.where((distance_matrix < cutoff)) arr_idx1,arr_idx2 = arr_idxs return arr_idx1, arr_idx2, distance_matrix
[docs] def find_base_position_orientation( u, selection_text='nucleic' ): ## Find orientation and center of each nucleotide transforms,centers = [[],[]] for r in u.select_atoms(selection_text).residues: key = resname_to_key[r.resname] ref_res = refUnis[key].residues[0] ref,sel = [res.atoms.select_atoms( stack_text ) for res in (ref_res,r)] assert(len(ref) == len(sel)) ## Find order for sel inv = ref_name_ids_inv[key] # sel_order = inv[np.argsort(sel.atoms.names)] sel_order = np.argsort(sel.atoms.names)[inv] assert( (ref.atoms.names == sel.atoms.names[sel_order]).all() ) ## Reorder coordinates in sel so atom names match if np.all(sel.atoms.masses == 0): sel.atoms.masses = [1]*len(sel.atoms) if np.all(ref.atoms.masses == 0): ref.atoms.masses = [1]*len(ref.atoms) c = sel.atoms.center_of_mass() R,rmsd = align.rotation_matrix(sel.positions[sel_order] - c, ref.positions - ref.atoms.center_of_mass()) R = np.array(R) # convert from mda matrix to np array transforms.append(R) centers.append(c) centers = np.array(centers, dtype=np.float32) transforms = np.array(transforms) return centers,transforms
[docs] def find_basepairs( u, centers, transforms, selection_text='nucleic' ): bonds = { b:"resname {} and name {}".format(resnames[b],bp_bond_atoms[b]) for b in bases } all_ = u.select_atoms(selection_text) sel1 = u.select_atoms("({}) and (({}) or ({}))".format(selection_text, bonds['A'],bonds['C'])) sel2 = u.select_atoms("({}) and (({}) or ({}))".format(selection_text, bonds['T'],bonds['G'])) allres = all_.residues.resindices basepairs = -np.ones(len(allres), dtype=int) ## Find likely candidates for basepairs idx1, idx2, dists = residues_within( cutoff = 3.8, coords1=sel1.positions, coords2=sel2.positions, sel1 = sel1, sel2 = sel2 ) ## Find mapping from idx1/idx2 to index of same residue in all_ def get_idx_to_all(sel): selres = sel.residues.resindices return np.searchsorted( allres, selres ) idx1_to_all, idx2_to_all = [ get_idx_to_all(sel) for sel in (sel1,sel2) ] ## Rank possible basepairs by expected position and angles between base orientation rank = 100*np.ones(dists.shape) for i in np.unique(idx1): all1 = idx1_to_all[i] R1 = transforms[all1] c1 = centers[all1] c2_expected = c1 + ref_bp_position.dot(R1) for j in idx2[ idx1 == i ]: all2 = idx2_to_all[j] R2 = transforms[all2] c2 = centers[all2] c1_expected = c2 + ref_bp_position.dot(R2) dist1 = np.linalg.norm(c1_expected-c1) dist2 = np.linalg.norm(c2_expected-c2) angle= R1.T.dot(np.array((0,0,1))).dot(R2.T.dot(np.array((0,0,1)))) rank[i,j] = 0.5*dist1**2 + 0.5*dist2**2 + 135*(1+angle)**2 flatrank = rank.flatten() IDX = np.argsort(rank, axis=None) IDX = IDX[:np.sum(flatrank < 25)] # truncate I = IDX//rank.shape[1] J = IDX - I * rank.shape[1] ALL1 = idx1_to_all[I] ALL2 = idx2_to_all[J] for all1,all2 in zip(ALL1,ALL2): if basepairs[all1] >= 0 or basepairs[all2] >= 0: continue basepairs[all1] = all2 basepairs[all2] = all1 # checksum = np.sum((basepairs[ALL1] < 0) + (basepairs[ALL2] < 0)) # if checksum > 0: # print(checksum) # import pdb # pdb.set_trace() # ... return basepairs
[docs] def find_stacks_mdanalysis( u, centers, transforms, selection_text='nucleic' ): ## Find orientation and center of each nucleotide expected_stack_positions = [] for R,c in zip(transforms,centers): expected_stack_positions.append( c + ref_stack_position.dot(R) ) expected_stack_positions = np.array(expected_stack_positions, dtype=np.float32) sel = u.select_atoms("({}) and name C1'".format( selection_text )) idx1, idx2, dists = residues_within( cutoff = 3.5, coords1 = centers, coords2 = expected_stack_positions, sel1 = sel, sel2 = sel ) ## Convert distances to stacks stacks_above = -np.ones(len(sel), dtype=int) _z = np.array((0,0,1)) for i in np.unique(idx1): js = idx2[ idx1 == i ] angles = [np.arccos( transforms[j].T.dot( transforms[i].dot(_z) ).dot( _z ) ) for j in js] angles = np.array( angles ) tmp = np.argmin(dists[i][js] + 1.0*angles) j = js[tmp] stacks_above[i] = j return stacks_above
[docs] def basepairs_and_stacks_to_helixmap(basepairs,stacks_above): helixmap = -np.ones(basepairs.shape, dtype=int) helixrank = -np.ones(basepairs.shape) is_fwd = np.ones(basepairs.shape, dtype=int) ## Remove stacks with nts lacking a basepairs nobp = np.where(basepairs < 0)[0] stacks_above[nobp] = -1 stacks_with_nobp = np.in1d(stacks_above, nobp) stacks_above[stacks_with_nobp] = -1 end_ids = np.where( (stacks_above < 0)*(basepairs >= 0) )[0] hid = 0 for end in end_ids: if helixmap[end] >= 0: continue rank = 0 nt = basepairs[end] bp = basepairs[nt] assert(helixmap[nt] == -1) assert(helixmap[bp] == -1) helixmap[nt] = helixmap[bp] = hid helixrank[nt] = helixrank[bp] = rank is_fwd[bp] = 0 rank +=1 while stacks_above[nt] >= 0: nt = stacks_above[nt] if basepairs[nt] < 0: break bp = basepairs[nt] assert(helixmap[nt] == -1) assert(helixmap[bp] == -1) helixmap[nt] = helixmap[bp] = hid helixrank[nt] = helixrank[bp] = rank is_fwd[bp] = 0 rank +=1 hid += 1 return helixmap, helixrank, is_fwd
[docs] def get_crossovers(helixmap,helixrank,stacks_above): ...
[docs] def update_helixmap_with_ssdna(helixmap,helixrank,stacks_above): ... ss_residues = hmap < 0 helixmap
[docs] def break_into_contiguous_groups(sequence): from operator import itemgetter from itertools import groupby return [map(itemgetter(1),g) for i,g in groupby(enumerate(sequence), lambda i,x:i-x)]
[docs] def get_lists_from_pdb(*args, **kwargs): u = mda.Universe(*args,**kwargs) for a in u.select_atoms("resname DT* THY and name C7").atoms: a.name = "C5M" ter = read_ter_from_pdb(*args) ## Find basepairs and base stacks centers,transforms = find_base_position_orientation(u) bps = find_basepairs(u, centers, transforms) # stacks = find_stacks_mdanalysis(u, centers, transforms) stacks = find_stacks(centers, transforms) ## Find three-prime ends three_prime = -np.ones(bps.shape, dtype=int) for s in u.segments: r = s.atoms.select_atoms("name C1'").resindices if u.segments[0] == s: print(r) # three_prime[r[:-1]] = r[1:] three_prime[r[:-1]] = r[1:] if u.segments[0] == s: print(three_prime[r]) """ ## Remove connections between resids that are very far away dist = centers[r[1:]]-centers[r[:-1]] dist = np.sqrt( np.sum( dist**2, axis=-1 ) ) tmp = np.where( dist > 20 )[0] three_prime[r[tmp]] = -1 """ ## MDanalysis fails to parse chains based on TER, so find which residues should be broken here three_prime[ter] = -1 ## Some pdbs might not follow the 5-to-3-prime direction convention; find if this is the case ids = bps >= 0 if np.mean(stacks[ids] == three_prime[ids]) < 0.25: five_prime = _three_prime_list_to_five_prime(three_prime) if np.mean(stacks[ids] == five_prime[ids]) < 0.5: print("WARNING: PDB might not follow a consistent convention for 5'-to-3' direction") else: three_prime = five_prime ## Find sequence seq = [resname_to_key[rn] for rn in u.select_atoms("name C1'").resnames] return centers, bps, stacks, three_prime, seq, np.array([t.T for t in transforms])
[docs] def SegmentModelFromPdb(*args,**kwargs): return model_from_basepair_stack_3prime( *get_lists_from_pdb(*args,**kwargs) )
if __name__ == "__main__": # u = mda.Universe("test-ssdna.psf", "test-ssdna.pdb") # u = mda.Universe("test-ssdna.pdb") # model = SegmentModelFromPdb("test-ssdna.pdb") # model = SegmentModelFromPdb("nanoengineer2pdb/nanoengineer2pdb.pdb") model = SegmentModelFromPdb("test-twist.pdb") # model = SegmentModelFromPdb("/data/server7/6hb.pdb") model._clear_beads() model._generate_atomic_model(scale=1.0) model.write_atomic_ENM( 'test-from-pdb' ) model.atomic_simulate( outputPrefix = 'test-from-pdb' )