# -*- coding: utf-8 -*-
[docs]
def printv(v):
return " ".join([str(x) for x in v])
import pdb
import numpy as np
import os,sys
from glob import glob
import re
import MDAnalysis as mda
import MDAnalysis.analysis.align as align
from MDAnalysis.analysis.distances import distance_array
from ..model.CanonicalNucleotideAtoms import CanonicalNucleotideFactory
from ..segmentmodel import SegmentModel, SingleStrandedSegment, DoubleStrandedSegment
from .. import get_resource_path
from .segmentmodel_from_lists import model_from_basepair_stack_3prime
from .segmentmodel_from_lists import _three_prime_list_to_five_prime
from .segmentmodel_from_lists import find_stacks
resnames = dict(A='ADE DA DA5 DA3', T='THY DT DT5 DT3',
C='CYT DC DC5 DC3', G='GUA DG DG5 DG3')
# bp_bond_atoms = dict( A='N1', T='H3',
# C='N3', G='H1' )
bp_bond_atoms = dict( A='N1', T='N3',
C='N3', G='N1' )
bases = resnames.keys()
resname_to_key = {n:k for k,v in resnames.items() for n in v.split()}
complement = dict(A='T', C='G', T='A', G='C')
refUnis = {b:mda.Universe(str(get_resource_path("generate-nts/1-w3dna/{}{}.pdb".format(b,complement[b]))))
for b in bases}
stack_text = "not name O5' P O1P O2P OP1 OP2 O3' 1H* 2H* 3H* 0H* H*"
ref_name_ids_inv = dict()
for key,val in refUnis.items():
ref_res = val.residues[0]
names = ref_res.atoms.select_atoms( stack_text ).atoms.names
ref_name_ids_inv[key] = np.argsort(np.argsort(names))
val.atoms.positions = val.atoms.positions.dot(CanonicalNucleotideFactory.DefaultOrientation) # return to default orientation
## TODO: possibly improve ref_stack_position
# avg_stack_position = [ru.residues[0].atoms.select_atoms(stack_text).center_of_mass()
# for k,ru in refUnis.items()]
# avg_stack_position = np.mean(avg_stack_position,axis=0)
# print(avg_stack_position)
## Obtained by apply stack transform to base
ref_stack_position = np.array((-2.41851735, -0.259761333, 3.39999978))
ref_bp_position = np.array((0, -8.7434375, -0.326354))
[docs]
def read_ter_from_pdb( *args ):
pdbfile = args[0]
for arg in args:
if arg[-3:].lower() == "pdb":
pdbfile = arg
ter = []
with open(pdbfile) as fh:
residue = -1
last = ""
for line in fh:
if line[:3] == "TER":
ter.append(residue)
elif line[:3] == "ATO" or line[:3] == "HET":
if last == "" or int(last[22:26]) != int(line[22:26]):
residue += 1
last = line
return ter
[docs]
def residues_within( cutoff, coords1, coords2, sel1, sel2 ):
"""
Returns lists of index of residues within sel1 and sel2 such that
sel1.residue[arr_idx1[i]] is within cutoff of sel2.residue[arr_idx2[i]] for all i
Also returns distance_matrix
Ignores residues that are the same in sel1 and sel2
"""
assert(len(sel1.atoms) > 0 and len(sel2.atoms) > 0)
assert(len(sel1.residues) == len(coords1))
assert(len(sel2.residues) == len(coords2))
distance_matrix = distance_array( coords1, coords2 )
## Ignore comparisons with self
res_diff = sel1.residues.resindices[:,np.newaxis] - sel2.residues.resindices[np.newaxis,:]
assert( res_diff.shape == distance_matrix.shape )
distance_matrix[ res_diff == 0 ] = distance_matrix[ res_diff == 0 ] + 10 *cutoff
arr_idxs = np.where((distance_matrix < cutoff))
arr_idx1,arr_idx2 = arr_idxs
return arr_idx1, arr_idx2, distance_matrix
[docs]
def find_base_position_orientation( u, selection_text='nucleic' ):
## Find orientation and center of each nucleotide
transforms,centers = [[],[]]
for r in u.select_atoms(selection_text).residues:
key = resname_to_key[r.resname]
ref_res = refUnis[key].residues[0]
ref,sel = [res.atoms.select_atoms( stack_text ) for res in (ref_res,r)]
assert(len(ref) == len(sel))
## Find order for sel
inv = ref_name_ids_inv[key]
# sel_order = inv[np.argsort(sel.atoms.names)]
sel_order = np.argsort(sel.atoms.names)[inv]
assert( (ref.atoms.names == sel.atoms.names[sel_order]).all() )
## Reorder coordinates in sel so atom names match
if np.all(sel.atoms.masses == 0):
sel.atoms.masses = [1]*len(sel.atoms)
if np.all(ref.atoms.masses == 0):
ref.atoms.masses = [1]*len(ref.atoms)
c = sel.atoms.center_of_mass()
R,rmsd = align.rotation_matrix(sel.positions[sel_order] - c,
ref.positions - ref.atoms.center_of_mass())
R = np.array(R) # convert from mda matrix to np array
transforms.append(R)
centers.append(c)
centers = np.array(centers, dtype=np.float32)
transforms = np.array(transforms)
return centers,transforms
[docs]
def find_basepairs( u, centers, transforms, selection_text='nucleic' ):
bonds = { b:"resname {} and name {}".format(resnames[b],bp_bond_atoms[b])
for b in bases }
all_ = u.select_atoms(selection_text)
sel1 = u.select_atoms("({}) and (({}) or ({}))".format(selection_text,
bonds['A'],bonds['C']))
sel2 = u.select_atoms("({}) and (({}) or ({}))".format(selection_text,
bonds['T'],bonds['G']))
allres = all_.residues.resindices
basepairs = -np.ones(len(allres), dtype=int)
## Find likely candidates for basepairs
idx1, idx2, dists = residues_within( cutoff = 3.8,
coords1=sel1.positions,
coords2=sel2.positions,
sel1 = sel1, sel2 = sel2 )
## Find mapping from idx1/idx2 to index of same residue in all_
def get_idx_to_all(sel):
selres = sel.residues.resindices
return np.searchsorted( allres, selres )
idx1_to_all, idx2_to_all = [ get_idx_to_all(sel) for sel in (sel1,sel2) ]
## Rank possible basepairs by expected position and angles between base orientation
rank = 100*np.ones(dists.shape)
for i in np.unique(idx1):
all1 = idx1_to_all[i]
R1 = transforms[all1]
c1 = centers[all1]
c2_expected = c1 + ref_bp_position.dot(R1)
for j in idx2[ idx1 == i ]:
all2 = idx2_to_all[j]
R2 = transforms[all2]
c2 = centers[all2]
c1_expected = c2 + ref_bp_position.dot(R2)
dist1 = np.linalg.norm(c1_expected-c1)
dist2 = np.linalg.norm(c2_expected-c2)
angle= R1.T.dot(np.array((0,0,1))).dot(R2.T.dot(np.array((0,0,1))))
rank[i,j] = 0.5*dist1**2 + 0.5*dist2**2 + 135*(1+angle)**2
flatrank = rank.flatten()
IDX = np.argsort(rank, axis=None)
IDX = IDX[:np.sum(flatrank < 25)] # truncate
I = IDX//rank.shape[1]
J = IDX - I * rank.shape[1]
ALL1 = idx1_to_all[I]
ALL2 = idx2_to_all[J]
for all1,all2 in zip(ALL1,ALL2):
if basepairs[all1] >= 0 or basepairs[all2] >= 0:
continue
basepairs[all1] = all2
basepairs[all2] = all1
# checksum = np.sum((basepairs[ALL1] < 0) + (basepairs[ALL2] < 0))
# if checksum > 0:
# print(checksum)
# import pdb
# pdb.set_trace()
# ...
return basepairs
[docs]
def find_stacks_mdanalysis( u, centers, transforms, selection_text='nucleic' ):
## Find orientation and center of each nucleotide
expected_stack_positions = []
for R,c in zip(transforms,centers):
expected_stack_positions.append( c + ref_stack_position.dot(R) )
expected_stack_positions = np.array(expected_stack_positions, dtype=np.float32)
sel = u.select_atoms("({}) and name C1'".format( selection_text ))
idx1, idx2, dists = residues_within( cutoff = 3.5,
coords1 = centers,
coords2 = expected_stack_positions,
sel1 = sel, sel2 = sel )
## Convert distances to stacks
stacks_above = -np.ones(len(sel), dtype=int)
_z = np.array((0,0,1))
for i in np.unique(idx1):
js = idx2[ idx1 == i ]
angles = [np.arccos( transforms[j].T.dot( transforms[i].dot(_z) ).dot( _z ) ) for j in js]
angles = np.array( angles )
tmp = np.argmin(dists[i][js] + 1.0*angles)
j = js[tmp]
stacks_above[i] = j
return stacks_above
[docs]
def basepairs_and_stacks_to_helixmap(basepairs,stacks_above):
helixmap = -np.ones(basepairs.shape, dtype=int)
helixrank = -np.ones(basepairs.shape)
is_fwd = np.ones(basepairs.shape, dtype=int)
## Remove stacks with nts lacking a basepairs
nobp = np.where(basepairs < 0)[0]
stacks_above[nobp] = -1
stacks_with_nobp = np.in1d(stacks_above, nobp)
stacks_above[stacks_with_nobp] = -1
end_ids = np.where( (stacks_above < 0)*(basepairs >= 0) )[0]
hid = 0
for end in end_ids:
if helixmap[end] >= 0:
continue
rank = 0
nt = basepairs[end]
bp = basepairs[nt]
assert(helixmap[nt] == -1)
assert(helixmap[bp] == -1)
helixmap[nt] = helixmap[bp] = hid
helixrank[nt] = helixrank[bp] = rank
is_fwd[bp] = 0
rank +=1
while stacks_above[nt] >= 0:
nt = stacks_above[nt]
if basepairs[nt] < 0: break
bp = basepairs[nt]
assert(helixmap[nt] == -1)
assert(helixmap[bp] == -1)
helixmap[nt] = helixmap[bp] = hid
helixrank[nt] = helixrank[bp] = rank
is_fwd[bp] = 0
rank +=1
hid += 1
return helixmap, helixrank, is_fwd
[docs]
def get_crossovers(helixmap,helixrank,stacks_above):
...
[docs]
def update_helixmap_with_ssdna(helixmap,helixrank,stacks_above):
...
ss_residues = hmap < 0
helixmap
[docs]
def break_into_contiguous_groups(sequence):
from operator import itemgetter
from itertools import groupby
return [map(itemgetter(1),g)
for i,g in groupby(enumerate(sequence), lambda i,x:i-x)]
[docs]
def get_lists_from_pdb(*args, **kwargs):
u = mda.Universe(*args,**kwargs)
for a in u.select_atoms("resname DT* THY and name C7").atoms:
a.name = "C5M"
ter = read_ter_from_pdb(*args)
## Find basepairs and base stacks
centers,transforms = find_base_position_orientation(u)
bps = find_basepairs(u, centers, transforms)
# stacks = find_stacks_mdanalysis(u, centers, transforms)
stacks = find_stacks(centers, transforms)
## Find three-prime ends
three_prime = -np.ones(bps.shape, dtype=int)
for s in u.segments:
r = s.atoms.select_atoms("name C1'").resindices
if u.segments[0] == s: print(r)
# three_prime[r[:-1]] = r[1:]
three_prime[r[:-1]] = r[1:]
if u.segments[0] == s: print(three_prime[r])
""" ## Remove connections between resids that are very far away
dist = centers[r[1:]]-centers[r[:-1]]
dist = np.sqrt( np.sum( dist**2, axis=-1 ) )
tmp = np.where( dist > 20 )[0]
three_prime[r[tmp]] = -1
"""
## MDanalysis fails to parse chains based on TER, so find which residues should be broken here
three_prime[ter] = -1
## Some pdbs might not follow the 5-to-3-prime direction convention; find if this is the case
ids = bps >= 0
if np.mean(stacks[ids] == three_prime[ids]) < 0.25:
five_prime = _three_prime_list_to_five_prime(three_prime)
if np.mean(stacks[ids] == five_prime[ids]) < 0.5:
print("WARNING: PDB might not follow a consistent convention for 5'-to-3' direction")
else:
three_prime = five_prime
## Find sequence
seq = [resname_to_key[rn]
for rn in u.select_atoms("name C1'").resnames]
return centers, bps, stacks, three_prime, seq, np.array([t.T for t in transforms])
[docs]
def SegmentModelFromPdb(*args,**kwargs):
return model_from_basepair_stack_3prime( *get_lists_from_pdb(*args,**kwargs) )
if __name__ == "__main__":
# u = mda.Universe("test-ssdna.psf", "test-ssdna.pdb")
# u = mda.Universe("test-ssdna.pdb")
# model = SegmentModelFromPdb("test-ssdna.pdb")
# model = SegmentModelFromPdb("nanoengineer2pdb/nanoengineer2pdb.pdb")
model = SegmentModelFromPdb("test-twist.pdb")
# model = SegmentModelFromPdb("/data/server7/6hb.pdb")
model._clear_beads()
model._generate_atomic_model(scale=1.0)
model.write_atomic_ENM( 'test-from-pdb' )
model.atomic_simulate( outputPrefix = 'test-from-pdb' )