117 lines
3.9 KiB
Python
117 lines
3.9 KiB
Python
import os
|
|
|
|
from collections import Counter
|
|
from functools import reduce
|
|
from itertools import permutations
|
|
|
|
BASES = ("A", "C", "G", "T")
|
|
|
|
|
|
def deduplicate(sequences):
|
|
"""Return the set of sequences with those that are a substring
|
|
of others removed too."""
|
|
sequences = set(sequences)
|
|
duplicates = set()
|
|
|
|
for s, t in permutations(sequences, 2):
|
|
if s != t and s in t:
|
|
duplicates.add(s)
|
|
|
|
return sequences - duplicates
|
|
|
|
|
|
def smash(s, t):
|
|
"""Return `s` concatenated with `t`. The longest suffix of `s`
|
|
that matches a prefix of `t` will be removed."""
|
|
for i in range(len(s)):
|
|
if t.startswith(s[i:]):
|
|
return s[:i] + t
|
|
return s + t
|
|
|
|
|
|
def shortest_superstring(sequences):
|
|
"""Return the shortest superstring covering all sequences. If
|
|
there are multiple shortest superstrings, an arbitrary
|
|
superstring is returned."""
|
|
sequences = deduplicate(sequences)
|
|
shortest = "".join(sequences)
|
|
|
|
for perm in permutations(sequences):
|
|
superstring = reduce(smash, perm)
|
|
if len(superstring) < len(shortest):
|
|
shortest = superstring
|
|
|
|
return shortest
|
|
|
|
|
|
def shortest_superstrings(sequences):
|
|
"""Return a list of all shortest superstrings that cover
|
|
`sequences`."""
|
|
sequences = deduplicate(sequences)
|
|
|
|
shortest = set(["".join(sequences)])
|
|
shortest_length = sum(len(s) for s in sequences)
|
|
|
|
for perm in permutations(sequences):
|
|
superstring = reduce(smash, perm)
|
|
superstring_length = len(superstring)
|
|
if superstring_length < shortest_length:
|
|
shortest.clear()
|
|
shortest.add(superstring)
|
|
shortest_length = superstring_length
|
|
elif superstring_length == shortest_length:
|
|
shortest.add(superstring)
|
|
|
|
return shortest
|
|
|
|
|
|
def print_report(sequence):
|
|
"""Writes a report to stdout for the given DNA sequence."""
|
|
buf = [f"Nucleotide counts for {sequence}:\n"]
|
|
|
|
counts = Counter(sequence)
|
|
for base in BASES:
|
|
buf.append(f"{base:>10}{counts.get(base, 0):>12}")
|
|
|
|
other = sum(v for k, v in counts.items() if k not in BASES)
|
|
buf.append(f"{'Other':>10}{other:>12}")
|
|
|
|
buf.append(" " * 5 + "_" * 17)
|
|
buf.append(f"{'Total length':>17}{sum(counts.values()):>5}")
|
|
|
|
print(os.linesep.join(buf), "\n")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
test_cases = [
|
|
("TA", "AAG", "TA", "GAA", "TA"),
|
|
("CATTAGGG", "ATTAG", "GGG", "TA"),
|
|
("AAGAUGGA", "GGAGCGCAUC", "AUCGCAAUAAGGA"),
|
|
(
|
|
"ATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTAT",
|
|
"GGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGT",
|
|
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
|
|
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
|
|
"AACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTT",
|
|
"GCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTC",
|
|
"CGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATTCTGCTTATAACACTATGTTCT",
|
|
"TGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
|
|
"CGTAAAAAATTACAACGTCCTTTGGCTATCTCTTAAACTCCTGCTAAATGCTCGTGC",
|
|
"GATGGAGCGCATCGAACGCAATAAGGATCATTTGATGGGACGTTTCGTCGACAAAGTCTTGTTTCGAGAGTAACGGCTACCGTCTTCGATT",
|
|
"TTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGATGGAGCGCATC",
|
|
"CTATGTTCTTATGAAATGGATGTTCTGAGTTGGTCAGTCCCAATGTGCGGGGTTTCTTTTAGTACGTCGGGAGTGGTATTATA",
|
|
"TCTCTTAAACTCCTGCTAAATGCTCGTGCTTTCCAATTATGTAAGCGTTCCGAGACGGGGTGGTCGATTCTGAGGACAAAGGTCAAGA",
|
|
),
|
|
]
|
|
|
|
for case in test_cases:
|
|
for superstring in shortest_superstrings(case):
|
|
print_report(superstring)
|
|
|
|
# or ..
|
|
#
|
|
# for case in test_cases:
|
|
# print_report(shortest_superstring(case))
|
|
#
|
|
# .. if you don't want all possible shortest superstrings.
|