RosettaCodeData/Task/Bioinformatics-Sequence-mut.../Haskell/bioinformatics-sequence-mut...

81 lines
3.1 KiB
Haskell

import Data.List (group, sort)
import Data.List.Split (chunksOf)
import System.Random (Random, randomR, random, newStdGen, randoms, getStdRandom)
import Text.Printf (PrintfArg(..), fmtChar, fmtPrecision, formatString, IsChar(..), printf)
data Mutation = Swap | Delete | Insert deriving (Show, Eq, Ord, Enum, Bounded)
data DNABase = A | C | G | T deriving (Show, Read, Eq, Ord, Enum, Bounded)
type DNASequence = [DNABase]
data Result = Swapped Mutation Int (DNABase, DNABase)
| InsertDeleted Mutation Int DNABase
instance Random DNABase where
randomR (a, b) g = case randomR (fromEnum a, fromEnum b) g of (x, y) -> (toEnum x, y)
random = randomR (minBound, maxBound)
instance Random Mutation where
randomR (a, b) g = case randomR (fromEnum a, fromEnum b) g of (x, y) -> (toEnum x, y)
random = randomR (minBound, maxBound)
instance PrintfArg DNABase where
formatArg x fmt = formatString (show x) (fmt { fmtChar = 's', fmtPrecision = Nothing })
instance PrintfArg Mutation where
formatArg x fmt = formatString (show x) (fmt { fmtChar = 's', fmtPrecision = Nothing })
instance IsChar DNABase where
toChar = head . show
fromChar = read . pure
chunkedDNASequence :: DNASequence -> [(Int, [DNABase])]
chunkedDNASequence = zip [50,100..] . chunksOf 50
baseCounts :: DNASequence -> [(DNABase, Int)]
baseCounts = fmap ((,) . head <*> length) . group . sort
newSequence :: Int -> IO DNASequence
newSequence n = take n . randoms <$> newStdGen
mutateSequence :: DNASequence -> IO (Result, DNASequence)
mutateSequence [] = fail "empty dna sequence"
mutateSequence ds = randomMutation >>= mutate ds
where
randomMutation = head . randoms <$> newStdGen
mutate xs m = do
i <- randomIndex (length xs)
case m of
Swap -> randomDNA >>= \d -> pure (Swapped Swap i (xs !! pred i, d), swapElement i d xs)
Insert -> randomDNA >>= \d -> pure (InsertDeleted Insert i d, insertElement i d xs)
Delete -> pure (InsertDeleted Delete i (xs !! pred i), dropElement i xs)
where
dropElement i xs = take (pred i) xs <> drop i xs
insertElement i e xs = take i xs <> [e] <> drop i xs
swapElement i a xs = take (pred i) xs <> [a] <> drop i xs
randomIndex n = getStdRandom (randomR (1, n))
randomDNA = head . randoms <$> newStdGen
mutate :: Int -> DNASequence -> IO DNASequence
mutate 0 s = pure s
mutate n s = do
(r, ms) <- mutateSequence s
case r of
Swapped m i (a, b) -> printf "%6s @ %-3d : %s -> %s \n" m i a b
InsertDeleted m i a -> printf "%6s @ %-3d : %s\n" m i a
mutate (pred n) ms
main :: IO ()
main = do
ds <- newSequence 200
putStrLn "\nInitial Sequence:" >> showSequence ds
putStrLn "\nBase Counts:" >> showBaseCounts ds
showSumBaseCounts ds
ms <- mutate 10 ds
putStrLn "\nMutated Sequence:" >> showSequence ms
putStrLn "\nBase Counts:" >> showBaseCounts ms
showSumBaseCounts ms
where
showSequence = mapM_ (uncurry (printf "%3d: %s\n")) . chunkedDNASequence
showBaseCounts = mapM_ (uncurry (printf "%s: %3d\n")) . baseCounts
showSumBaseCounts xs = putStrLn (replicate 6 '-') >> printf "Σ: %d\n\n" (length xs)