Amino Acid table: graphed

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
{-# LANGUAGE ViewPatterns #-}

module Genes.AminoAcid where

{--
Continuation of the solution for sequencing amino acids started with Data.Bases
at http://lpaste.net/107069 ... here we map amino acids to their corresponding
nucleotide triples.
--}

import Control.Arrow
import Data.Array
import Data.Char
import Data.Foldable (toList)
import Data.List ((\\))
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Maybe (mapMaybe, fromJust, fromMaybe)
import Data.Set (Set)
import qualified Data.Set as Set

import Genes.Bases                 -- http://lpaste.net/107069

data AcidName = A | C | D | E | F | G | H | I | K | L
              | M | N | P | Q | R | S | T | V | W | Y
   deriving (Eq, Ord, Enum, Bounded, Ix, Show, Read)

data AminoAcid = Acid AcidName (Set NucleotideTriple)
   deriving (Eq, Ord)

instance Show AminoAcid where
   show (Acid name triples) =
        show name ++ " : " ++ show (Set.toList triples)

parseAcidDefs :: FilePath -> IO [AminoAcid]
parseAcidDefs = fmap (map parseAcid . lines) . readFile

-- so the above is one way to do it: import the CODON table from a file.
-- Another way to do it is to define the table internally and use parseAcid.

acidDefs :: [String]
acidDefs = ["a: gct gcc gca gcg","c: tgt tgc","d: gat gac","e: gaa gag",
            "f: ttt ttc","g: ggt ggc gga ggg","h: cat cac","i: att atc ata",
            "k: aaa aag","l: tta ttg ctt ctc cta ctg","m: atg","n: aat aac",
            "p: cct ccc cca ccg","q: caa cag","r: cgt cgc cga cgg aga agg",
            "s: tct tcc tca tcg agt agc","t: act acc aca acg",
            "v: gtt gtc gta gtg","w: tgg","y: tat tac"]

parseAcid :: String -> AminoAcid
parseAcid (words . map toUpper -> ((name:_):rest)) =
   Acid (read $ return name) (Set.fromList $ map read rest)

-- *Genes.AminoAcid> map parseAcid acidDefs ~> [[A : [GCA,GCT,GCC,GCG], ...]

-- start is ATG. I know this

start :: NucleotideTriple
start = read "ATG"

-- stop is merely the dual of the intersection of all possible nucleotide
-- triples to the ones that map to acids (aka 'list difference')

stop :: [(AcidName, NucleotideTriple)] -> [NucleotideTriple]
stop = (allTriples \\) . map snd

-- *Genes.AminoAcid> stop (sparse acids) ~> [TAA,TAG,TGA]

stops :: [NucleotideTriple]
stops = stop (sparse (map parseAcid acidDefs))

-- to use stop, we need to simplify the Acid-values to tuples:

sparse :: [AminoAcid] -> [(AcidName, NucleotideTriple)]
sparse = concatMap (sequence . second toList . simplify)

simplify :: AminoAcid -> (AcidName, Set NucleotideTriple)
simplify (Acid n s) = (n, s)

data CODONTable =
   Table (Map NucleotideTriple AcidName) (Map AcidName (Set NucleotideTriple))

data AminoCode = AC AcidName | Stop
   deriving (Eq, Ord, Show)

uncode :: AminoCode -> AcidName
uncode = fromJust . code2Maybe

code2Maybe :: AminoCode -> Maybe AcidName
code2Maybe (AC n) = pure n
code2Maybe Stop = Nothing

-- uncode of Stop is an error

-- now we need to de-sparse the sparseness ... or we can just use the
-- AminoAcid set ...

codonTable :: [AminoAcid] -> CODONTable
codonTable = 
   map simplify                                           >>>
   map swap . concatMap (sequence . second toList) &&& id >>>
   Map.fromList *** Map.fromList                          >>>
   uncurry Table

swap :: (a,b) -> (b,a)
swap = snd &&& fst

codons :: CODONTable
codons = codonTable (map parseAcid acidDefs)

-- Given that we have the codon table, let's extract triples from acids 
-- and vice versa

triples :: CODONTable -> AcidName -> Set NucleotideTriple
triples (Table _ m) = (m Map.!)

-- *Genes.Proteins> triples tab D ~> {GAT,GAC}

acid :: CODONTable -> NucleotideTriple -> AminoCode
acid (Table m _) = maybe Stop AC . (`Map.lookup` m)

-- *Genes.Proteins> acid tab (read "TAT") ~> AC Y
-- *Genes.Proteins> acid tab (read "TAA") ~> Stop

-- The dual: looking up the triples from the amino acid name.

trips :: CODONTable -> AcidName -> Set NucleotideTriple
trips (Table _ m) = fromMaybe Set.empty . (`Map.lookup` m)

-- And with that, we have the first step to gene sequencing.

-- the below is to solve what triples compose a protein
-- a dynamic-programming exercise

target :: String -> CODONTable -> [[NucleotideTriple]]
target acid (Table _ names) = 
   let aminoAcidNames = map (read . return . toUpper) acid
       aminoAcids = mapMaybe (`Map.lookup` names) aminoAcidNames
   in  retargeted $ map Set.toList aminoAcids 

{--
But the function targeted is the wrong orientation we want M x N not N x M
so we have to flip/redistribute the triples.
--}

retargeted :: [[NucleotideTriple]] -> [[NucleotideTriple]]
retargeted [] = [[]]
retargeted (trips:rest) = trips >>= (`map` retargeted rest) . (:)
  -- n.b. "trips >>= for (retargeted rest) . (:)" loops. Why?

go :: String -> [[NucleotideTriple]]
go acid = target acid codons

-- acids.defs can be found at http://lpaste.net/raw/3763369603112108032

{--
go "kmspdw" ~> [[AAA,ATG,AGT,CCA,GAT,TGG], ... 95 others]

Cool beans!

Other samples:

go "abdd" ~> error [b is not an acid]

go "acddtci" ~> [768 sequences] ... wow!
--}

{--
Notes while solving this exercise:

1 and 2. Given Data.Bases and Data.AminoAcid:

As we already proved that there is at most one amino acid defined by
a nucleotide triple, we improve the type-signature of the acid function
to reflect this semideterminacy

3. 4 * 4 * 4 == 64 

or 

let bases [A, G, T, C] 
in  length [Triple (a, b, c) | a <- bases, b <- bases, c <- bases]

4. solution provided by the go function or target function
--}