Bases to your Acids

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
module Genes.Bases where

{--
The below solution for the exercise at http://lpaste.net/107023 that was (more
than) a bit of a challenge is expressed in two separate modules: Genens.Bases and
Genes.AminoAcid. We present both in two lpaste files, the first one has a link 
to the second at the bottom of the file as a comment.
--}

import Control.Comonad
import Control.Monad.State
import Data.Array
import Data.Char

import Data.Random               -- http://lpaste.net/3547465428253016064

data Base = A | T | C | G
   deriving (Eq, Ord, Enum, Bounded, Ix, Show, Read)

allBases :: [Base]
allBases = [A ..]

data NucleotideTriple = Triple (Base, Base, Base)
   deriving (Eq, Ord)

allTriples :: [NucleotideTriple]
allTriples = map Triple [(a,b,c) | a <- allBases, b <- allBases, c <- allBases]

{-- 
For the read instance of a triple, we will specialize on the formatting
we are give which is, e.g.: "tgg"
--}

instance Show NucleotideTriple where
   show (Triple (a, b, c)) = map (head . show) [a, b, c]

mkTrip :: [Base] -> NucleotideTriple
mkTrip [a,b,c] = Triple (a,b,c)

instance Read NucleotideTriple where
   readsPrec prec = readParen False 
     (\r -> [(mkTrip bases, rest) |
                (trechars, rest) <- lex r,
                let bases = map (read . return . toUpper) trechars])

-- so, let triple = Triple (T, G, G) in read (show triple) == triple is True

-- Genes.AminoAcid, the continuation to the solution, is located at
-- http://lpaste.net/107071

-- Now, let's talk generating Gene Sequences from 1HAD 2016-02-12:

data GeneSequence = GS [Base]

instance Show GeneSequence where
  show (GS seq) = map (head . show) seq

-- exercise-for-some-other-day: what is the Read instance of GeneSequence?

rndGeneSequence :: Monad m => Int -> StateT (RNG Int) m GeneSequence
rndGeneSequence = liftM (GS . map (toEnum . (`mod` 4))) . someRNDs

-- ... return a really long one, like more than 1,000 nucleotides ... we'll
-- be sequencing it for a future exercise, don't you know.

-- (So, yah, I just said, 'return a random gene sequence, but make sure it's
-- long') (deal with the conundrum.)

-- geophf, n.: heart-breaker, dream wrecker, train crasher, conundrum maker.

{--
*Main> rndSeed >>= evalStateT (rndGeneSequence 1000) ~>
AAATGTATGCCCTGAGTGCCGTTCTTGATGCTGTTGATCGCGGATTGCGTGACCCCCTGAGAAATTATTCGGACGTG
AATGTATCGCACCGTTGAGGGCGACACTCCCTGATGTTTTTTGGGAATCATGCAATTAAACATCAGAGCTTGCTGCA
CCGTCAGCAGTGATTTTGTGGTTCTGCCGCACTATGCCAGTTTGAGCCTGTTTGATATGGAGCACTATTGAGTAAAG
GGACTTTCTGTTTGTGAGAACTTCACCTTGGTCGCGCCAACGCGAGCACGTTCGAAATACTAATCACCATCTCTGTG
TGTATTGATGGTACCCCATATACTGTTGATCAGCGCAGACGTACAGCGCGCTTCTACAATAAACTTGTAGGTATGCT
CTCCAAAAAAAGACTAATTTGCCGATTTCCGCTTTTGCGAGACACTTGAGTAAGAATAGGCTCACAGTACGCAACCG
ACGAAAGATGTCTGGTATGCGAAATCCCCTCTGATTGCGGCATTGGGGATCATATAGGAGCATGAGAATCAATTTTT
CCACAGGAACGCGTATGGCATAAACTCTCATCCATCGATACGCCAGCATTACACGATGGACTGAACAATTACCCAGC
CAACTTGGAGTTCTCCCGGACGACAAATCTTAGGTCCGAGCGAGACGCTCAGTTATAAAATCATGTATAACCCCTGA
ACGTCCTTGCGATCGCTTGCCCCGCAAATTATCATTTATGTTCGTAGTAATGGGCGTGGATAGTCGTCTGTATTCTG
CAAGGTCGCTGCCATACAAAGGATTAAGAGTTGTGTTAGATCGCGATTAACGTAGGCTTGGAGACGCACCGAGGAGT
ATTGACCTGGACGGGTATCGCGTCGGAAGACCGCACGCTCGACGAGCTATCCTTTTTGGTGCCGTGATCTGATTAGA
GTACGGTCCAGGCAAATTGGGGGAAACGCAAGGGTTTACAAGACGGGCGATCGAGCGCATAATTATAGCAGTAACC

Okay, now let's separate the above into nucleotide triples.

Here's the thing: in a strand, you don't know where you're 'starting.' So you 
don't know what the triples are, so, say you have this strand:

... GCTCAGATGTTAGAAGGAGATTAGGCGTCAGAAGCTAAGGGTATTCCAGA ...

Where is it's 'beginning'? You don't know. So the leading triples here could be:

[GCT,CAG,ATG,...]  (obviously)

OR, they could be:

[CTC,AGA,TGT, ...] (drop 1)

OR, they could be:

[TCA,GAT,GTT, ...] (drop 2)

So let's do that. How? Comonads, obviously!

First we make the bases of a gene sequence a comonad. We can do this because
we 'know' that a gene sequence is never empty. 'Know' is a technical term:
it means 'know.'
--}

instance Comonad [] where
   extract = head
   duplicate [] = []
   duplicate list = list : duplicate (tail list)

-- Now we chunk-n-dunk? Nah: let's slice-n-dice, instead!

strands :: GeneSequence -> [[NucleotideTriple]]
strands (GS genes) = take 3 (genes =>> slicer)

slicer :: [Base] -> [NucleotideTriple]
slicer (a:b:c:rest) = Triple (a,b,c) : slicer rest
slicer _            = []

{--
So, for the above set of bases:

*Main> let bases = "GCTCAGATGTTAGAAGGAGATTAGGCGTCAGAAGCTAAGGGTATTCCAGA"
*Main> strands (GS $ map (read . return) bases) ~>
[[GCT,CAG,ATG,TTA,GAA,GGA,GAT,TAG,GCG,TCA,GAA,GCT,AAG,GGT,ATT,CCA],
 [CTC,AGA,TGT,TAG,AAG,GAG,ATT,AGG,CGT,CAG,AAG,CTA,AGG,GTA,TTC,CAG],
 [TCA,GAT,GTT,AGA,AGG,AGA,TTA,GGC,GTC,AGA,AGC,TAA,GGG,TAT,TCC,AGA]]

Okay, now let's generate a random set of bases:

*Main> rndSeed >>= evalStateT (rndGeneSequence 1000) ~> 
TTTCACAGAAAGCGTTAGGGGAGGGCCAGGCGTCTTTGAGTGTTCATTTATTC ...

*Main> strands bases ~> 
[[TTT,CAC,AGA,AAG,CGT,TAG,GGG,AGG,GCC,AGG,CGT,CTT,TGA, ...], ...]
--}

-- How about the reverse complement?
 
reverseComplement :: GeneSequence -> GeneSequence
reverseComplement (GS bases) = GS (map complement (reverse bases))

complement :: Base -> Base
complement A = T
complement T = A
complement G = C
complement C = G