Skip to content

segmentmodel_from_lists

Module documentation for segmentmodel_from_lists.

API Reference

UNIT_circular()

Make a circular DNA strand, with dsDNA in the middle

Source code in mrdna/readers/segmentmodel_from_lists.py
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
def UNIT_circular():
    """ Make a circular DNA strand, with dsDNA in the middle """
    coordinate = [(0,0,3.4*i) for i in range(7)]
    stack = -1*np.ones(len(coordinate))
    three_prime = [ 1, 2, 4,-1, 6, 3, 0]
    basepair    = [-1,-1, 3, 2, 5, 4,-1]
    stack       = [-1,-1, 4,-1,-1, 3,-1]
    for i in [3,5]:
        coordinate[i] = (1,0,3.4*i)

    model = model_from_basepair_stack_3prime(coordinate, basepair, stack, three_prime,
                                             max_basepairs_per_bead=1,
                                             max_nucleotides_per_bead=1,
                                             local_twist=True)

    model.simulate(output_name='circular', directory='unittest')

basepairs_and_stacks_to_helixmap(basepairs, stacks_above, nucleotide_rank=None)

Decide how helices should be created from basepair and stack data. Optionally use nucleotide_rank to determine direction of each helix and order of helices

Source code in mrdna/readers/segmentmodel_from_lists.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
def basepairs_and_stacks_to_helixmap(basepairs,stacks_above, nucleotide_rank=None):
    """ Decide how helices should be created from basepair and stack data. Optionally use nucleotide_rank to determine direction of each helix and order of helices """

    helixmap = -np.ones(basepairs.shape, dtype=int)
    helixrank = -np.ones(basepairs.shape)
    is_fwd = np.ones(basepairs.shape, dtype=int)

    if nucleotide_rank is None:
        nucleotide_rank = np.arange(len(basepairs), dtype=int)

    ## Remove stacks with nts lacking a basepairs
    nobp = np.where(basepairs < 0)[0]
    stacks_above[nobp] = -1
    stacks_with_nobp = np.in1d(stacks_above, nobp)
    stacks_above[stacks_with_nobp] = -1

    end_ids = np.where( (stacks_above < 0)*(basepairs >= 0) )[0]

    hid = 0
    for end in sorted(end_ids, key=lambda x: nucleotide_rank[x]):
        if helixmap[end] >= 0:
            continue
        rank = 0
        nt = basepairs[end]
        bp = basepairs[nt]
        assert( bp == end )
        if helixmap[nt] >= 0 or helixmap[bp] >= 0:
            logger.warning(f'Ill-formed helix: problematic basepair or stacking data near nucleotide {nt} or {bp}... skipping')
            continue
        # assert(helixmap[nt] == -1)
        # assert(helixmap[bp] == -1)
        helixmap[nt] = helixmap[bp] = hid
        helixrank[nt] = helixrank[bp] = rank
        is_fwd[bp] = 0
        rank +=1

        _tmp = [(nt,bp)]

        while stacks_above[nt] >= 0:
            nt = stacks_above[nt]
            if basepairs[nt] < 0: break
            bp = basepairs[nt]
            if helixmap[nt] >= 0 or helixmap[bp] >= 0:
                logger.warning(f'Ill-formed helix: problematic basepair or stacking data near nucleotide {nt} or {bp}... skipping')
                break
            helixmap[nt] = helixmap[bp] = hid
            helixrank[nt] = helixrank[bp] = rank
            is_fwd[bp] = 0
            _tmp.append((nt,bp))
            rank +=1

        hid += 1

    ## Create "helix" for each circular segment
    intrahelical = []
    processed = set()
    unclaimed_bases = np.where( (basepairs >= 0)*(helixmap == -1) )[0]
    for nt0 in unclaimed_bases:
        if nt0 in processed: continue

        nt = nt0
        all_nts = [nt]

        rank = 0
        nt = nt0
        bp = basepairs[nt]
        if helixmap[nt] >= 0 or helixmap[bp] >= 0:
            logger.warning(f'Ill-formed cylic helix: problematic basepair or stacking data near nucleotide {nt} or {bp}... skipping')
            continue
        helixmap[nt] = helixmap[bp] = hid
        helixrank[nt] = helixrank[bp] = rank
        is_fwd[bp] = 0
        rank +=1
        processed.add(nt)
        processed.add(bp)

        counter = 0
        while stacks_above[nt] >= 0:
            lastnt = nt
            nt = stacks_above[nt]
            bp = basepairs[nt]
            if nt == nt0 or nt == basepairs[nt0]:
                intrahelical.append((lastnt,nt0))
                break

            assert( bp >= 0 )
            if helixmap[nt] >= 0 or helixmap[bp] >= 0:
                logger.warning(f'Ill-formed cyclic helix: problematic basepair or stacking data near nucleotide {nt} or {bp}... skipping')
                break

            helixmap[nt] = helixmap[bp] = hid
            helixrank[nt] = helixrank[bp] = rank
            is_fwd[bp] = 0
            processed.add(nt)
            processed.add(bp)
            rank +=1
        hid += 1

    return helixmap, helixrank, is_fwd, intrahelical

model_from_basepair_stack_3prime(coordinate, basepair, stack, three_prime, sequence=None, orientation=None, rank=None, explicitly_unstacked=tuple(), max_basepairs_per_bead=5, max_nucleotides_per_bead=5, local_twist=False, dimensions=(5000, 5000, 5000), **model_parameters)

Creates a SegmentModel object from lists of each nucleotide's basepair, its stack (on 3' side) and its 3'-connected nucleotide

The first argument should be an N-by-3 numpy array containing the coordinate of each nucleotide, where N is the number of nucleotides. The following three arguments should be integer lists where the i-th element corresponds to the i-th nucleotide; the list element should the integer index of the corresponding basepaired / stacked / phosphodiester-bonded nucleotide. If there is no such nucleotide, the value should be -1.

By default, where there is a 3'-to-5' connection between double-straned helices, they will be connected by an intrahelical connection that makes the helical domain continue across the connection, even if they do not stack in the stacking array. Instead, a "terminal_crossover" representing a relatively free (e.g. kinked) junction can be placed by including the index of one of the participating bases in the optional :explicitly_unstacked: tuple.

Parameters:

Name Type Description Default
basepair

List of each nucleotide's basepair's index

required
stack

List containing index of the nucleotide stacked on the 3' of each nucleotide

required
three_prime

List of each nucleotide's the 3' end of each nucleotide

required

Returns:

Type Description

SegmentModel

Source code in mrdna/readers/segmentmodel_from_lists.py
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
def model_from_basepair_stack_3prime(coordinate, basepair, stack, three_prime,
                                     sequence=None, orientation=None,
                                     rank=None, explicitly_unstacked = tuple(),
                                     max_basepairs_per_bead = 5,
                                     max_nucleotides_per_bead = 5,
                                     local_twist = False,
                                     dimensions=(5000,5000,5000),
                                     **model_parameters):
    """ 
    Creates a SegmentModel object from lists of each nucleotide's
    basepair, its stack (on 3' side) and its 3'-connected nucleotide

    The first argument should be an N-by-3 numpy array containing the
    coordinate of each nucleotide, where N is the number of
    nucleotides. The following three arguments should be integer lists
    where the i-th element corresponds to the i-th nucleotide; the
    list element should the integer index of the corresponding
    basepaired / stacked / phosphodiester-bonded nucleotide. If there
    is no such nucleotide, the value should be -1.

    By default, where there is a 3'-to-5' connection between
    double-straned helices, they will be connected by an intrahelical
    connection that makes the helical domain continue across the
    connection, even if they do not stack in the stacking
    array. Instead, a "terminal_crossover" representing a relatively
    free (e.g. kinked) junction can be placed by including the index
    of one of the participating bases in the optional
    :explicitly_unstacked: tuple.

    Args:
        basepair:  List of each nucleotide's basepair's index
        stack:  List containing index of the nucleotide stacked on the 3' of each nucleotide
        three_prime:  List of each nucleotide's the 3' end of each nucleotide

    Returns:
        SegmentModel
    """

    """ Validate Input """
    inputs = (basepair,three_prime)
    try:
        basepair,three_prime = [np.array(a,dtype=int) for a in inputs]
    except:
        raise TypeError("One or more of the input lists could not be converted into a numpy array")
    inputs = (basepair,three_prime)
    coordinate = np.array(coordinate)

    if np.any( [len(a.shape) > 1 for a in inputs] ):
        raise ValueError("One or more of the input lists has the wrong dimensionality")

    if len(coordinate.shape) != 2:
        raise ValueError("Coordinate array has the wrong dimensionality")

    inputs = (coordinate,basepair,three_prime)
    if not np.all(np.diff([len(a) for a in inputs]) == 0):
        raise ValueError("Inputs are not the same length")

    num_nt = len(basepair)
    if sequence is not None and len(sequence) != num_nt:
        raise ValueError("The 'sequence' parameter is the wrong length {} != {}".format(len(sequence),num_nt))

    if orientation is not None:
        orientation = np.array(orientation)
        if len(orientation.shape) != 3:
            raise ValueError("The 'orientation' array has the wrong dimensionality (should be Nx3x3)")
        if orientation.shape != (num_nt,3,3):
            raise ValueError("The 'orientation' array is not properly formatted")

    if stack is None:
        if orientation is not None:
            stack = find_stacks(coordinate, orientation)
        else:
            ## Guess stacking based on 3' connectivity
            stack = np.array(three_prime,dtype=int) # Assume nts on 3' ends are stacked
            _stack_below = _three_prime_list_to_five_prime(stack)
            _has_bp = (basepair >= 0)
            _nostack = np.where( (stack == -1)*_has_bp )[0]
            _has_stack_below = _stack_below[basepair[_nostack]] >= 0
            _nostack2 = _nostack[_has_stack_below]
            stack[_nostack2] = basepair[_stack_below[basepair[_nostack2]]]

    else:
        try:
            stack = np.array(stack,dtype=int)
        except:
            raise TypeError("The 'stack' array could not be converted into a numpy integer array")

        if len(stack.shape) != 1:
            raise ValueError("The 'stack' array has the wrong dimensionality")

        if len(stack) != num_nt:
            raise ValueError("The length of the 'stack' array does not match other inputs")

    bps = basepair              # alias

    """ Fix stacks: require that the stack of a bp of a base's stack is its bp """
    _has_bp =  (bps >= 0)
    _has_stack = (stack >= 0)
    _stack_has_basepair = (bps[stack] >= 0) * _has_stack
    stack = np.where( (stack[bps[stack]] == bps) * _has_bp * _has_stack * _has_bp,
                      stack, -np.ones(len(stack),dtype=int) )

    five_prime = _three_prime_list_to_five_prime(three_prime)

    """ Build map of dsDNA helices and strands """
    hmap,hrank,fwd,intrahelical = basepairs_and_stacks_to_helixmap(bps,stack,rank)
    double_stranded_helices = np.unique(hmap[hmap >= 0])    
    strands, strand_is_circular = _primes_list_to_strands(three_prime, five_prime)

    """ Add ssDNA to hmap """
    if len(double_stranded_helices) > 0:
        hid = double_stranded_helices[-1]+1
    else:
        hid = 0
    ss_residues = hmap < 0
    #
    if np.any(bps[ss_residues] != -1):
        logger.warning(f'{np.sum(bps[ss_residues] != -1)} ssDNA nucleotides appear to have basepairs... ignoring')

    for s,c in zip(strands, strand_is_circular):
        strand_segment_ends = [i for i in np.where( np.diff(hmap[s]) != 0 )[0]] + [len(s)-1]
        seg_start = 0
        for i in strand_segment_ends:
            if hmap[s[i]] < 0:
                ## Found single-stranded segment
                ids = s[seg_start:i+1]
                assert( np.all(hmap[ids] == -1) )
                hmap[ids] = hid
                hrank[ids] = np.arange(i+1-seg_start)
                hid+=1
            seg_start = i+1

    if len(double_stranded_helices) > 0:
        single_stranded_helices = np.arange(double_stranded_helices[-1]+1,hid)
    else:
        single_stranded_helices = np.arange(hid)

    devlogger.info(f"Creating {len(double_stranded_helices)} dsDNA and {len(single_stranded_helices)} ssDNA segments")

    ## Create double-stranded segments
    doubleSegments = []
    for hid in double_stranded_helices:
        seg = DoubleStrandedSegment(name=str(hid),
                                    num_bp = np.sum(hmap==hid)//2,
                                    _helix_idx = hid)
        set_splines(seg, coordinate, hid, hmap, hrank, fwd, basepair, orientation)
        assert(hid == len(doubleSegments))
        doubleSegments.append(seg)

    ## Create single-stranded segments
    singleSegments = []
    for hid in single_stranded_helices:
        seg = SingleStrandedSegment(name=str(hid),
                                    num_nt = np.sum(hmap==hid),
                                    _helix_idx = hid)
        set_splines(seg, coordinate, hid, hmap, hrank, fwd, basepair, orientation)

        assert(hid == len(doubleSegments) + len(singleSegments))
        singleSegments.append(seg)

    devlogger.debug("Finding crossovers and strand ends")

    ## Find crossovers and 5prime/3prime ends
    crossovers,prime5,prime3 = [[],[],[]]
    for s,c in zip(strands,strand_is_circular):
        tmp = np.where(np.diff(hmap[s]) != 0)[0]
        for i in tmp:
            crossovers.append( (s[i],s[i+1]) )
        if c:
            if hmap[s[-1]] != hmap[s[0]]:
                crossovers.append( (s[-1],s[0]) )
        else:
            prime5.append(s[0])
            prime3.append(s[-1])

    ## Add connections
    allSegments = doubleSegments+singleSegments

    devlogger.debug(f"Adding {len(crossovers)} connections")
    for r1,r2 in crossovers:
        seg1,seg2 = [allSegments[hmap[i]] for i in (r1,r2)]
        nt1,nt2 = [hrank[i] for i in (r1,r2)]
        f1,f2 = [fwd[i] for i in (r1,r2)]

        ## Handle connections at the ends
        is_terminal1 = (nt1,f1) in ((0,0),(seg1.num_nt-1,1))
        is_terminal2 = (nt2,f2) in ((0,1),(seg2.num_nt-1,0))

        # devlogger.info(seg1,seg2, r1, r2, is_terminal1, is_terminal2)
        if is_terminal1 or is_terminal2:
            """ Ensure that we don't have three-way dsDNA junctions """
            if is_terminal1 and (bps[r1] >= 0) and (five_prime[bps[r1]] >= 0) and (three_prime[r1] >= 0):
                if (bps[five_prime[bps[r1]]] >= 0) and (bps[three_prime[r1]] >= 0):
                    # is_terminal1 = (three_prime[r1] == bps[five_prime[bps[r1]]])
                    is_terminal1 = hmap[five_prime[bps[r1]]] == hmap[three_prime[r1]]
            if is_terminal2 and (bps[r2] >= 0) and (three_prime[bps[r2]] >= 0) and (five_prime[r2] >= 0):
                if (bps[three_prime[bps[r2]]] >= 0) and (bps[five_prime[r2]] >= 0):
                    # is_terminal2 = (five_prime[r2] == bps[three_prime[bps[r2]]])
                    is_terminal2 = hmap[three_prime[bps[r2]]] == hmap[five_prime[r2]]

            """ Place connection """
            if is_terminal1 and is_terminal2 and (r1 not in explicitly_unstacked) and (r2 not in explicitly_unstacked):
                end1 = seg1.end3 if f1 else seg1.start3
                end2 = seg2.start5 if f2 else seg2.end5
                seg1._connect_ends( end1, end2, type_='intrahelical')
            else:
                seg1.add_crossover(nt1,seg2,nt2,[f1,f2],type_="terminal_crossover")
        else:
            seg1.add_crossover(nt1,seg2,nt2,[f1,f2])

    devlogger.debug(f"Converting terminal crossovers to regular ones if other crossovers are nearby")
    for seg1 in doubleSegments:
        cAB = seg1.get_connections_and_locations(exclude='intrahelical')
        conns = {}
        for c,A,B in cAB:
            seg2 = B.container
            if seg2 not in conns: conns[seg2] = []
            assert('crossover' in c.type_)
            conns[seg2].append(c)

        nt_pos = {c:A.get_nt_pos() for c,A,B in cAB}
        for c,A,B in cAB:
            if c.type_ != 'terminal_crossover': continue
            seg2 = B.container
            if seg2 not in doubleSegments: continue
            dists = np.abs(np.array([nt_pos[c] - nt_pos[c2] for c2 in conns[seg2]]))
            # devlogger.debug(f'Terminal crossover {c} distance: {dists}')
            if np.sum((dists > 0) & (dists < 75)) > 0:
                # devlogger.debug(f'Fixing crossover at {c,A,B}')
                c.type_ = 'crossover'

    devlogger.debug(f"Adding prime ends and dealing with circular helices")
    ## Add 5prime/3prime ends
    for r in prime5:
        seg = allSegments[hmap[r]]
        seg.add_5prime(hrank[r],fwd[r])
    for r in prime3:
        seg = allSegments[hmap[r]]
        seg.add_3prime(hrank[r],fwd[r])

    ## Add intrahelical connections to circular helical sections
    for nt0,nt1 in intrahelical:
        seg = allSegments[hmap[nt0]]
        assert( seg is allSegments[hmap[nt1]] )
        if three_prime[nt0] >= 0:
            if hmap[nt0] == hmap[three_prime[nt0]]:
                seg.connect_end3(seg.start5)

        bp0,bp1 = [bps[nt] for nt in (nt0,nt1)]
        if three_prime[bp1] >= 0:
            if hmap[bp1] == hmap[three_prime[bp1]]:
                seg.connect_start3(seg.end5)

    devlogger.debug(f"Adding sequence")
    ## Assign sequence
    if sequence is not None:
        for hid in range(len(allSegments)):
            resids = np.where( (hmap==hid)*(fwd==1) )[0]
            s = allSegments[hid]
            s.sequence = [sequence[r] for r in sorted(resids,key=lambda x: hrank[x])]


    devlogger.debug(f"Building model")
    ## Build model
    model = SegmentModel( allSegments,
                          max_basepairs_per_bead = max_basepairs_per_bead,
                          max_nucleotides_per_bead = max_nucleotides_per_bead,
                          local_twist = local_twist,
                          dimensions = dimensions,
                          **model_parameters )


    devlogger.debug(f"Adding _reader_list attributes and randomizing unset sequences")
    model._reader_list_coordinates = coordinate
    model._reader_list_basepair = basepair
    model._reader_list_stack = stack
    model._reader_list_three_prime = three_prime
    model._reader_list_five_prime = five_prime
    model._reader_list_sequence = sequence
    model._reader_list_orientation = orientation
    model._reader_list_hmap = hmap
    model._reader_list_fwd = fwd
    model._reader_list_hrank = hrank
    if sequence is None:
        for s in model.segments:
            s.randomize_unset_sequence()
    return model