236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522 | def model_from_basepair_stack_3prime(coordinate, basepair, stack, three_prime,
sequence=None, orientation=None,
rank=None, explicitly_unstacked = tuple(),
max_basepairs_per_bead = 5,
max_nucleotides_per_bead = 5,
local_twist = False,
dimensions=(5000,5000,5000),
**model_parameters):
"""
Creates a SegmentModel object from lists of each nucleotide's
basepair, its stack (on 3' side) and its 3'-connected nucleotide
The first argument should be an N-by-3 numpy array containing the
coordinate of each nucleotide, where N is the number of
nucleotides. The following three arguments should be integer lists
where the i-th element corresponds to the i-th nucleotide; the
list element should the integer index of the corresponding
basepaired / stacked / phosphodiester-bonded nucleotide. If there
is no such nucleotide, the value should be -1.
By default, where there is a 3'-to-5' connection between
double-straned helices, they will be connected by an intrahelical
connection that makes the helical domain continue across the
connection, even if they do not stack in the stacking
array. Instead, a "terminal_crossover" representing a relatively
free (e.g. kinked) junction can be placed by including the index
of one of the participating bases in the optional
:explicitly_unstacked: tuple.
Args:
basepair: List of each nucleotide's basepair's index
stack: List containing index of the nucleotide stacked on the 3' of each nucleotide
three_prime: List of each nucleotide's the 3' end of each nucleotide
Returns:
SegmentModel
"""
""" Validate Input """
inputs = (basepair,three_prime)
try:
basepair,three_prime = [np.array(a,dtype=int) for a in inputs]
except:
raise TypeError("One or more of the input lists could not be converted into a numpy array")
inputs = (basepair,three_prime)
coordinate = np.array(coordinate)
if np.any( [len(a.shape) > 1 for a in inputs] ):
raise ValueError("One or more of the input lists has the wrong dimensionality")
if len(coordinate.shape) != 2:
raise ValueError("Coordinate array has the wrong dimensionality")
inputs = (coordinate,basepair,three_prime)
if not np.all(np.diff([len(a) for a in inputs]) == 0):
raise ValueError("Inputs are not the same length")
num_nt = len(basepair)
if sequence is not None and len(sequence) != num_nt:
raise ValueError("The 'sequence' parameter is the wrong length {} != {}".format(len(sequence),num_nt))
if orientation is not None:
orientation = np.array(orientation)
if len(orientation.shape) != 3:
raise ValueError("The 'orientation' array has the wrong dimensionality (should be Nx3x3)")
if orientation.shape != (num_nt,3,3):
raise ValueError("The 'orientation' array is not properly formatted")
if stack is None:
if orientation is not None:
stack = find_stacks(coordinate, orientation)
else:
## Guess stacking based on 3' connectivity
stack = np.array(three_prime,dtype=int) # Assume nts on 3' ends are stacked
_stack_below = _three_prime_list_to_five_prime(stack)
_has_bp = (basepair >= 0)
_nostack = np.where( (stack == -1)*_has_bp )[0]
_has_stack_below = _stack_below[basepair[_nostack]] >= 0
_nostack2 = _nostack[_has_stack_below]
stack[_nostack2] = basepair[_stack_below[basepair[_nostack2]]]
else:
try:
stack = np.array(stack,dtype=int)
except:
raise TypeError("The 'stack' array could not be converted into a numpy integer array")
if len(stack.shape) != 1:
raise ValueError("The 'stack' array has the wrong dimensionality")
if len(stack) != num_nt:
raise ValueError("The length of the 'stack' array does not match other inputs")
bps = basepair # alias
""" Fix stacks: require that the stack of a bp of a base's stack is its bp """
_has_bp = (bps >= 0)
_has_stack = (stack >= 0)
_stack_has_basepair = (bps[stack] >= 0) * _has_stack
stack = np.where( (stack[bps[stack]] == bps) * _has_bp * _has_stack * _has_bp,
stack, -np.ones(len(stack),dtype=int) )
five_prime = _three_prime_list_to_five_prime(three_prime)
""" Build map of dsDNA helices and strands """
hmap,hrank,fwd,intrahelical = basepairs_and_stacks_to_helixmap(bps,stack,rank)
double_stranded_helices = np.unique(hmap[hmap >= 0])
strands, strand_is_circular = _primes_list_to_strands(three_prime, five_prime)
""" Add ssDNA to hmap """
if len(double_stranded_helices) > 0:
hid = double_stranded_helices[-1]+1
else:
hid = 0
ss_residues = hmap < 0
#
if np.any(bps[ss_residues] != -1):
logger.warning(f'{np.sum(bps[ss_residues] != -1)} ssDNA nucleotides appear to have basepairs... ignoring')
for s,c in zip(strands, strand_is_circular):
strand_segment_ends = [i for i in np.where( np.diff(hmap[s]) != 0 )[0]] + [len(s)-1]
seg_start = 0
for i in strand_segment_ends:
if hmap[s[i]] < 0:
## Found single-stranded segment
ids = s[seg_start:i+1]
assert( np.all(hmap[ids] == -1) )
hmap[ids] = hid
hrank[ids] = np.arange(i+1-seg_start)
hid+=1
seg_start = i+1
if len(double_stranded_helices) > 0:
single_stranded_helices = np.arange(double_stranded_helices[-1]+1,hid)
else:
single_stranded_helices = np.arange(hid)
devlogger.info(f"Creating {len(double_stranded_helices)} dsDNA and {len(single_stranded_helices)} ssDNA segments")
## Create double-stranded segments
doubleSegments = []
for hid in double_stranded_helices:
seg = DoubleStrandedSegment(name=str(hid),
num_bp = np.sum(hmap==hid)//2,
_helix_idx = hid)
set_splines(seg, coordinate, hid, hmap, hrank, fwd, basepair, orientation)
assert(hid == len(doubleSegments))
doubleSegments.append(seg)
## Create single-stranded segments
singleSegments = []
for hid in single_stranded_helices:
seg = SingleStrandedSegment(name=str(hid),
num_nt = np.sum(hmap==hid),
_helix_idx = hid)
set_splines(seg, coordinate, hid, hmap, hrank, fwd, basepair, orientation)
assert(hid == len(doubleSegments) + len(singleSegments))
singleSegments.append(seg)
devlogger.debug("Finding crossovers and strand ends")
## Find crossovers and 5prime/3prime ends
crossovers,prime5,prime3 = [[],[],[]]
for s,c in zip(strands,strand_is_circular):
tmp = np.where(np.diff(hmap[s]) != 0)[0]
for i in tmp:
crossovers.append( (s[i],s[i+1]) )
if c:
if hmap[s[-1]] != hmap[s[0]]:
crossovers.append( (s[-1],s[0]) )
else:
prime5.append(s[0])
prime3.append(s[-1])
## Add connections
allSegments = doubleSegments+singleSegments
devlogger.debug(f"Adding {len(crossovers)} connections")
for r1,r2 in crossovers:
seg1,seg2 = [allSegments[hmap[i]] for i in (r1,r2)]
nt1,nt2 = [hrank[i] for i in (r1,r2)]
f1,f2 = [fwd[i] for i in (r1,r2)]
## Handle connections at the ends
is_terminal1 = (nt1,f1) in ((0,0),(seg1.num_nt-1,1))
is_terminal2 = (nt2,f2) in ((0,1),(seg2.num_nt-1,0))
# devlogger.info(seg1,seg2, r1, r2, is_terminal1, is_terminal2)
if is_terminal1 or is_terminal2:
""" Ensure that we don't have three-way dsDNA junctions """
if is_terminal1 and (bps[r1] >= 0) and (five_prime[bps[r1]] >= 0) and (three_prime[r1] >= 0):
if (bps[five_prime[bps[r1]]] >= 0) and (bps[three_prime[r1]] >= 0):
# is_terminal1 = (three_prime[r1] == bps[five_prime[bps[r1]]])
is_terminal1 = hmap[five_prime[bps[r1]]] == hmap[three_prime[r1]]
if is_terminal2 and (bps[r2] >= 0) and (three_prime[bps[r2]] >= 0) and (five_prime[r2] >= 0):
if (bps[three_prime[bps[r2]]] >= 0) and (bps[five_prime[r2]] >= 0):
# is_terminal2 = (five_prime[r2] == bps[three_prime[bps[r2]]])
is_terminal2 = hmap[three_prime[bps[r2]]] == hmap[five_prime[r2]]
""" Place connection """
if is_terminal1 and is_terminal2 and (r1 not in explicitly_unstacked) and (r2 not in explicitly_unstacked):
end1 = seg1.end3 if f1 else seg1.start3
end2 = seg2.start5 if f2 else seg2.end5
seg1._connect_ends( end1, end2, type_='intrahelical')
else:
seg1.add_crossover(nt1,seg2,nt2,[f1,f2],type_="terminal_crossover")
else:
seg1.add_crossover(nt1,seg2,nt2,[f1,f2])
devlogger.debug(f"Converting terminal crossovers to regular ones if other crossovers are nearby")
for seg1 in doubleSegments:
cAB = seg1.get_connections_and_locations(exclude='intrahelical')
conns = {}
for c,A,B in cAB:
seg2 = B.container
if seg2 not in conns: conns[seg2] = []
assert('crossover' in c.type_)
conns[seg2].append(c)
nt_pos = {c:A.get_nt_pos() for c,A,B in cAB}
for c,A,B in cAB:
if c.type_ != 'terminal_crossover': continue
seg2 = B.container
if seg2 not in doubleSegments: continue
dists = np.abs(np.array([nt_pos[c] - nt_pos[c2] for c2 in conns[seg2]]))
# devlogger.debug(f'Terminal crossover {c} distance: {dists}')
if np.sum((dists > 0) & (dists < 75)) > 0:
# devlogger.debug(f'Fixing crossover at {c,A,B}')
c.type_ = 'crossover'
devlogger.debug(f"Adding prime ends and dealing with circular helices")
## Add 5prime/3prime ends
for r in prime5:
seg = allSegments[hmap[r]]
seg.add_5prime(hrank[r],fwd[r])
for r in prime3:
seg = allSegments[hmap[r]]
seg.add_3prime(hrank[r],fwd[r])
## Add intrahelical connections to circular helical sections
for nt0,nt1 in intrahelical:
seg = allSegments[hmap[nt0]]
assert( seg is allSegments[hmap[nt1]] )
if three_prime[nt0] >= 0:
if hmap[nt0] == hmap[three_prime[nt0]]:
seg.connect_end3(seg.start5)
bp0,bp1 = [bps[nt] for nt in (nt0,nt1)]
if three_prime[bp1] >= 0:
if hmap[bp1] == hmap[three_prime[bp1]]:
seg.connect_start3(seg.end5)
devlogger.debug(f"Adding sequence")
## Assign sequence
if sequence is not None:
for hid in range(len(allSegments)):
resids = np.where( (hmap==hid)*(fwd==1) )[0]
s = allSegments[hid]
s.sequence = [sequence[r] for r in sorted(resids,key=lambda x: hrank[x])]
devlogger.debug(f"Building model")
## Build model
model = SegmentModel( allSegments,
max_basepairs_per_bead = max_basepairs_per_bead,
max_nucleotides_per_bead = max_nucleotides_per_bead,
local_twist = local_twist,
dimensions = dimensions,
**model_parameters )
devlogger.debug(f"Adding _reader_list attributes and randomizing unset sequences")
model._reader_list_coordinates = coordinate
model._reader_list_basepair = basepair
model._reader_list_stack = stack
model._reader_list_three_prime = three_prime
model._reader_list_five_prime = five_prime
model._reader_list_sequence = sequence
model._reader_list_orientation = orientation
model._reader_list_hmap = hmap
model._reader_list_fwd = fwd
model._reader_list_hrank = hrank
if sequence is None:
for s in model.segments:
s.randomize_unset_sequence()
return model
|