-- Scanning CSV files

module Control.Scan.CSV where

{--
A set of functions around tokenizing/tokenising/rending lists and then patching
them back up together again/weaving/intercalating <-- smh: 'intercalate'? really?
--}

import Control.Arrow
import Data.List (intercalate)

import Control.Presentation       -- http://lpaste.net/588030780018524160

-- I do this often enough: scanning in a CSV file, so, here it is:

-- your words function with a 'by'-argument. Should be there already ...

-- note something interesting about wordsBy: nowhere is this list necessarily
-- a list of characters. We can generalize this.

-- Split a string on any character of the delimiter set. A one-character set
-- is handed to 'rend'; every other set goes through 'rendBy' with a
-- membership test against the set.
wordsBy :: String -> String -> [String]
wordsBy delims = case delims of
    [only] -> rend only
    _      -> rendBy (`elem` delims)

{-- e.g.:

*Main> wordsBy "|" "REQMT209||" ~> ["REQMT209",""]

*Control.Scan.CSV> let line = "REQMT207|REQMT228, REQMT57|REQMT228, REQMT57"
*Control.Scan.CSV> wordsBy "|" line ~>
["REQMT207","REQMT228, REQMT57","REQMT228, REQMT57"]

*Control.Scan.CSV> wordsBy "|," line ~>
["REQMT207","REQMT228"," REQMT57","REQMT228"," REQMT57"]

Note the leading spaces. Adding ' ' to the delimiters removes them, but rendBy
keeps the empty field between each ',' and the ' ' that follows it:

*Control.Scan.CSV> wordsBy "|, " line ~>
["REQMT207","REQMT228","","REQMT57","REQMT228","","REQMT57"]

An alternate implementation that replaces the original hand-coded wordsBy:
--}

-- Split a list on every occurrence of one delimiter element. Practice shows
-- that a single delimiter is all that is needed.
rend :: Eq a => a -> [a] -> [[a]]
rend delim = rendBy isDelim
  where isDelim x = x == delim

-- Split a list into fields at every element that satisfies the predicate.
-- The delimiter elements themselves are dropped.
--
-- An empty field in front of a delimiter is kept, so columns stay aligned:
--   rendBy (== ',') "a,,b" ~> ["a","","b"]
--   rendBy (== ',') ",a"   ~> ["","a"]
-- A delimiter at the very end of the input does not start a further field,
-- and empty input has no fields at all:
--   rendBy (== ',') "a,"   ~> ["a"]
--   rendBy (== ',') ""     ~> []
rendBy :: (a -> Bool) -> [a] -> [[a]]
rendBy _ [] = []
rendBy isDelim xs = field : rendBy isDelim (drop 1 rest)
  where
    -- 'rest' is either empty or starts with the delimiter that ended 'field';
    -- 'drop 1' skips that delimiter and leaves an empty remainder alone.
    (field, rest) = break isDelim xs

-- A total 'tail': drops the first element of a list, and returns the empty
-- list unchanged instead of failing on it.
softtail :: [a] -> [a]
softtail = drop 1

-- *Main> rend ',' "1,2,3" ~> ["1","2","3"]

-- Split one CSV line into its fields.
--
-- A bare  rend ','  is not enough, because a double-quoted field may itself
-- contain commas. So the text in front of the first double quote is split on
-- commas with 'rend', and everything from that quote onwards is handed to
-- 'escapedThenCSVd'.
csv :: String -> [String]
csv "" = []
csv str = rend ',' plain ++ escapedThenCSVd fromQuote
  where (plain, fromQuote) = break (== '"') str

-- Parse a line that starts with a double-quoted field. The first character
-- (the opening quote) is skipped, everything up to the closing quote becomes
-- one field with its commas intact, and the remainder goes back to 'csv'.
--
-- A doubled quote inside the quoted field stands for one literal quote
-- character, as in RFC 4180. A field with no closing quote runs to the end
-- of the line.
--
--   escapedThenCSVd "\"1,2\",3"            ~> ["1,2","3"]
--   escapedThenCSVd "\"say \"\"hi\"\"\",3" ~> ["say \"hi\"","3"]
escapedThenCSVd :: String -> [String]
escapedThenCSVd "" = []
escapedThenCSVd (_ : afterOpen) = field : csv (drop 1 rest)
  where
    -- 'rest' is what follows the closing quote; 'drop 1' skips the single
    -- separator character after it and leaves an empty remainder alone.
    (field, rest) = unquote afterOpen

    -- Collect the field body up to the closing quote, returning the body and
    -- the text after that quote.
    unquote ('"' : '"' : more) = let (cs, after) = unquote more in ('"' : cs, after)
    unquote ('"' : after)      = ("", after)
    unquote (c : more)         = let (cs, after) = unquote more in (c : cs, after)
    unquote ""                 = ("", "")

{--
*Main> csv "1,2,3" ~> ["1","2","3"]
*Main> csv "\"1,2\",3" ~> ["1,2","3"]
*Main> csv "\"1,2\",3,4,5,\"6,7,8\",9,10" ~> ["1,2","3","4","5","6,7,8","9","10"]
*Main> csv "\"1,2\",3,4,5,\"6,7,8\",\"9,10\"" ~> ["1,2","3","4","5","6,7,8","9,10"]

There ya go! Quote escaping csv-parser. Yay.
--}

-- Straight from Prelude.unwords using 'sep' instead of spaces as separators.
-- isn't this intercalate? In which case (intercalate is just a weird word for
-- a function name, isn't it), then, aren't we weaving the separator into this
-- list of sublists?

-- Join a list of strings into one string, placing the separator character
-- between each adjacent pair.
unwordsBy :: Char -> [String] -> String
unwordsBy sep fields = intercalate [sep] fields

-- Render a value as one comma-separated line: 'explode' turns the value into
-- a list of strings, which are then joined with commas by 'unwordsBy'.
-- The strings are joined as they are; nothing is quoted or escaped here.
-- NOTE(review): 'Univ' and 'explode' presumably come from
-- Control.Presentation -- confirm.
uncsv :: Univ a => a -> String
uncsv value = unwordsBy ',' (explode value)

{--
let ans = ["REQMT207","REQMT228","REQMT57","REQMT228","REQMT57"]
*Control.Scan.CSV> unwordsBy ',' ans ~>
"REQMT207,REQMT228,REQMT57,REQMT228,REQMT57"

So this was a little exercise: How to convert this line:

REQMT207|REQMT228, REQMT57|REQMT228, REQMT57

Into 'just' this csv line:

REQMT207,REQMT228,REQMT57,REQMT228,REQMT57
--}
{-- hlint report that came with the paste (module line 49, column 5, in rendBy):

Warning: Redundant bracket
Found:
if fn h then ([] : rendBy fn t) else
(second (rendBy fn . softtail) >>> uncurry (:)) $ break fn line
Why not:
if fn h then [] : rendBy fn t else
(second (rendBy fn . softtail) >>> uncurry (:)) $ break fn line
--}