tabulate.hs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
{-# LANGUAGE BangPatterns #-}
import qualified System.FilePath.Glob as Glob
import System.Environment
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.UTF8 as U
import Data.Char
import Data.List
import qualified Data.Set as Set

data Stat = Stat {
    stChars    :: !Int,
    stVowels   :: !Int,
    stPairEL   :: !Int,
    stWords    :: !Int
}
instance Show Stat where
  show (Stat stChars stVowels stPairEL stWords) =
    "Characters: " ++ show stChars ++ "\n" ++
    "Vowels: " ++ show stVowels ++ "\n" ++
    "EL pairs: " ++ show stPairEL ++ "\n" ++
    "Words: " ++ show stWords ++ "\n"
defaultStat = Stat 0 0 0 0
combineStat :: Stat -> Stat -> Stat
combineStat
  (Stat s1Chars s1Vowels s1PairEL s1Words)
  (Stat s2Chars s2Vowels s2PairEL s2Words) =
  Stat
    (s1Chars + s2Chars)
    (s1Vowels + s2Vowels)
    (s1PairEL + s2PairEL)
    (s1Words + s2Words)


main = do
  argv <- getArgs
  case argv of
    [] -> print "Need to specify language code (e.g. \"es\")"
    [lang] -> processLang lang

processLang :: String -> IO ()
processLang lang = do
  files <- getFileList lang
  results <- mapM tabulateFile files
  print $ foldl' combineStat defaultStat results
--  mapM_ (\x -> do
--            results <- tabulateFile x
--            putStrLn $ show x ++ "\n" ++ (show results))
--    files

--tabulateAndPrint file = do
--  results <- tabulateFile file
--  print file ++ "\n" ++ (show results)

getFileList :: String -> IO ([FilePath])
getFileList lang = do
  let patt = Glob.compile "*.txt"
  Glob.globDir1 patt (lang++"/")

tabulateText :: Char -> L.ByteString -> Stat -> Stat
tabulateText lastChr bs stat =
  case U.uncons bs of
    Nothing -> stat
    Just (chr, newBs) ->
      tabulateText lchr newBs (countChar lastChr lchr stat)
        where lchr = toLower chr

{-# INLINE countChar #-}
countChar :: Char -> Char -> Stat -> Stat
countChar !lastChr !chr !(Stat stChars stVowels stPairEL stWords) =
  Stat
    (stChars  + 1)
    (stVowels + (countIf $ isVowel chr))
    (stPairEL + (countIf (lastChr == 'e' && chr == 'l')))
    (stWords  + (countIf ((not $ isLetter lastChr) && isLetter chr)))

tabulateFile :: FilePath -> IO Stat
tabulateFile path = do
  putStrLn path
  contents <- L.readFile path
  return $! tabulateText ' ' contents defaultStat


-- Checks if a Char is a vowel. Only works for lowercase letters.
-- This list must be in order for fromAscList to work properly
vowels = Set.fromAscList ['a', 'e', 'i', 'o', 'u',
  'à',
  '\xe0', --SMALL LETTER A WITH GRAVE
  '\xe1', --SMALL LETTER A WITH ACUTE
  '\xe2', --SMALL LETTER A WITH CIRCUMFLEX
  '\xe3', --SMALL LETTER A WITH TILDE
  '\xe4', --SMALL LETTER A WITH DIAERESIS
  '\xe5', --SMALL LETTER A WITH RING ABOVE
  '\xe6', --SMALL LETTER AE
  '\xe8', --SMALL LETTER E WITH GRAVE
  '\xe9', --SMALL LETTER E WITH ACUTE
  '\xea', --SMALL LETTER E WITH CIRCUMFLEX
  '\xeb', --SMALL LETTER E WITH DIAERESIS
  '\xec', --SMALL LETTER I WITH GRAVE
  '\xed', --SMALL LETTER I WITH ACUTE
  '\xee', --SMALL LETTER I WITH CIRCUMFLEX
  '\xef', --SMALL LETTER I WITH DIAERESIS
  '\xf1', --SMALL LETTER N WITH TILDE
  '\xf2', --SMALL LETTER O WITH GRAVE
  '\xf3', --SMALL LETTER O WITH ACUTE
  '\xf4', --SMALL LETTER O WITH CIRCUMFLEX
  '\xf5', --SMALL LETTER O WITH TILDE
  '\xf6', --SMALL LETTER O WITH DIAERESIS
  '\xf8', --SMALL LETTER O WITH STROKE
  '\xf9', --SMALL LETTER U WITH GRAVE
  '\xfa', --SMALL LETTER U WITH ACUTE
  '\xfb', --SMALL LETTER U WITH CIRCUMFLEX
  '\xfc', --SMALL LETTER U WITH DIAERESIS
  '\xfd', --SMALL LETTER Y WITH ACUTE
  '\xff'  --SMALL LETTER Y WITH DIAERESIS
  ]

--vowels = ['a', 'e', 'i', 'o', 'u']
isVowel :: Char -> Bool
isVowel c = Set.member c vowels
--isVowel _ = True
--isVowel 'a' = True
--isVowel 'e' = True
--isVowel 'i' = True
--isVowel 'o' = True
--isVowel 'u' = True
--isVowel _ = False
--isVowel c = elem c vowels
--isVowel c = isLetter c && V.elem c vowels
  
countIf :: Bool -> Int
countIf True = 1
countIf False = 0
54:29: Error: Redundant bracket
Found:
([FilePath])
Why not:
[FilePath]
72:6: Warning: Redundant $
Found:
stVowels + (countIf $ isVowel chr)
Why not:
stVowels + countIf (isVowel chr)
73:6: Warning: Redundant bracket
Found:
stPairEL + (countIf (lastChr == 'e' && chr == 'l'))
Why not:
stPairEL + countIf (lastChr == 'e' && chr == 'l')
74:6: Warning: Redundant bracket
Found:
stWords + (countIf ((not $ isLetter lastChr) && isLetter chr))
Why not:
stWords + countIf ((not $ isLetter lastChr) && isLetter chr)
74:27: Warning: Redundant $
Found:
(not $ isLetter lastChr) && isLetter chr
Why not:
not (isLetter lastChr) && isLetter chr
85:26: Warning: Use string literal
Found:
['a', 'e', 'i', 'o', 'u', '\224', '\224', '\225', '\226', '\227',
'\228', '\229', '\230', '\232', '\233', '\234', '\235', '\236',
'\237', '\238', '\239', '\241', '\242', '\243', '\244', '\245',
'\246', '\248', '\249', '\250', '\251', '\252', '\253', '\255']
Why not:
"aeiou\224\224\225\226\227\228\229\230\232\233\234\235\236\237\238\239\241\242\243\244\245\246\248\249\250\251\252\253\255"