f

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
{- TODO ByteString and Attoparsec instead of String and Parsec -}
{- TODO Complain about stupid data? eg. protein seq if DNA Fasta -}

module ParseFasta
    ( parseFasta   -- * generic parser for single FASTA block
    , parseFastaM  -- * generic parser for multiple FASTA blocks
    , parseFastaM' -- * Fasta ADT, list of (defline, sequence) tuples
    ) 
  where

{-# LANGUAGE OverloadedStrings #-}

import Text.Parsec
import Text.Parsec.Char (digit, letter)
import Control.Applicative hiding ((<|>))
import System.Environment (getArgs)
import qualified Data.ByteString.Lazy as L
import Data.Char (toUpper)
import qualified Data.Map (fromList)


-- | Defline, required
defline1 :: Parsec String st Char
defline1 = spaces *> char '>' <|> char ';' -- noone uses ; anymore

defline2 :: Parsec String st String
defline2 = many (noneOf "\r\n") <* newline

defline = defline1 *> defline2 <?> "Parse error on defline"


-- | Sequence

-- must begin with a character

parseXNA1 :: Parsec String st String
parseXNA1 =   spaces
           *> many letter <* spaces
          <|> many letter <* newline


parseXNA2 :: Parsec String st [String]
parseXNA2 = sepBy1 (many letter)
              (many1 (char ' ') <|>  many1 newline) 

parseXNA = mconcat <$> parseXNA2

parseFasta = (\a b -> [a,b]) <$> defline <*> parseXNA

parseFastaM = many1 parseFasta

-- | ADT for storing and pretty printing
--

type Defline = String
type Sequence = String

data Fasta = Fasta { getFasta :: (Defline, Sequence) }

instance Show Fasta where
    show (Fasta (def, seq)) =
         def ++ "\n" ++ (map toUpper seq) ++ "\n"


parseFasta' :: Parsec String st Fasta
parseFasta' = (\a b -> Fasta (a,b)) <$> defline <*> parseXNA

parseFastaM' :: Parsec String st [Fasta]
parseFastaM' = many1 parseFasta'


main = do
    file <- readFile "fasta.txt"
    let parsed = parse parseFastaM' "" file
    print $ (\(Right x) -> x) parsed