lojban morphology parser

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
{-# LANGUAGE RecursiveDo #-}
import Prelude hiding (not)
import Text.Parsers.Frisby
import Data.Char (toUpper)

not = doesNotMatch
loption = option []
cmany f = concat `fmap` many f 
cmany1 f = concat `fmap` many1 f 

parser = mdo
    words       <- newRule $ optional pause ->> cmany (word <<- optional pause)
    word        <- newRule $ lojbanWord // nonLojbanWord
    lojbanWord  <- newRule $ cmene // cmavo // brivla
    brivla      <- newRule $ gismu // fuhivla // lujvo
    cmene       <- newRule $ jbocme // zifcme
    zifcme      <- newRule $ not h ->> cmany (nucleus // glide // h // consonant <<- doesNotMatch pause // digit)
                                  ->> consonant
                                  ->> matches pause
    jbocme      <- newRule $ matches zifcme ->> cmany (anySyllable // digit) <<- matches pause

    cmavo       <- newRule $ not cmene ->> not cvcyLujvo ->> cmavoForm <<- matches postWord
    cvcyLujvo   <- newRule $ cvcRafsi <++> y <++> loption h <++> cmany initialRafsi <++> brivlaCore
                // stressedCvcRafsi <++> y <++> shortFinalRafsi
    cmavoForm   <- newRule $ not h ->> not cluster ->> onset <++> cmany (nucleus <++> h) <++> (not stressed ->> nucleus // nucleus <<- not cluster)
                // cmany1 y
                // digit

    lujvo       <- newRule $ not gismu ->> not fuhivla ->> not cmavo ->> cmany initialRafsi <++> brivlaCore
    brivlaCore  <- newRule $ fuhivla // gismu // cvvFinalRafsi // stressedInitialRafsi <++> shortFinalRafsi
    stressedInitialRafsi <- newRule $ stressedExtendedRafsi // stressedYRafsi // stressedYLessRafsi
    initialRafsi <- newRule $ extendedRafsi // yRafsi // not anyExtendedRafsi ->> yLessRafsi
    anyExtendedRafsi <- newRule $ fuhivla // extendedRafsi // stressedExtendedRafsi

    fuhivla     <- newRule $ fuhivlaHead <++> stressedSyllable <++> cmany consonantalSyllable <++> finalSyllable
    stressedExtendedRafsi <- newRule $ stressedBrivlaRafsi // stressedFuhivlaRafsi
    extendedRafsi <- newRule $ brivlaRafsi // fuhivlaRafsi
    stressedBrivlaRafsi <- newRule $ matches unstressedSyllable ->> brivlaHead <++> stressedSyllable <++> h <++> y
    brivlaRafsi <- newRule $ matches (syllable <<- cmany consonantalSyllable <<- syllable) ->> brivlaHead <++> h <++> y <++> loption h
    stressedFuhivlaRafsi <- newRule $ fuhivlaHead <++> stressedSyllable <++> (matches consonant ->> onset <++> y)
    fuhivlaRafsi <- newRule $ not unstressedSyllable ->> (fuhivlaHead <<- stressedSyllable) <++> (matches consonant ->> onset <++> y)
    fuhivlaHead <- newRule $ not rafsiString ->> brivlaHead
    brivlaHead  <- newRule $ not cmavo ->> not slinkuhi ->> not h ->> matches onset ->> cmany unstressedSyllable
    slinkuhi    <- newRule $ consonant <++> rafsiString
    rafsiString <- newRule $ cmany yLessRafsi 
                    <++> (gismu 
                        // cvvFinalRafsi 
                        // stressedYLessRafsi <++> shortFinalRafsi
                        // yRafsi
                        // stressedYRafsi
                        // loption stressedYLessRafsi <++> initialPair <++> y
                        )
    gismu       <- newRule $ ((initialPair <++> stressedVowel // consonant <++> stressedVowel <++> consonant) <<- matches finalSyllable) 
                    <++> consonant <++> vowel <<- matches postWord
    cvvFinalRafsi <- newRule $ consonant <++> stressedVowel <++> h <++> matches finalSyllable ->> vowel <<- matches postWord
    shortFinalRafsi <- newRule $ matches finalSyllable ->> (consonant <++> diphthong // initialPair <++> vowel) <<- matches postWord
    stressedYRafsi <- newRule $ (stressedLongRafsi // stressedCvcRafsi) <++> y
    stressedYLessRafsi <- newRule $ stressedCvcRafsi <<- not y // stressedCcvRafsi // stressedCvvRafsi
    stressedLongRafsi <- newRule $ initialPair <++> stressedVowel <++> consonant // consonant <++> stressedVowel <++> consonant <++> consonant
    stressedCvcRafsi <- newRule $ consonant <++> stressedVowel <++> consonant
    stressedCcvRafsi <- newRule $ initialPair <++> stressedVowel
    stressedCvvRafsi <- newRule $ consonant <++> (unstressedVowel <++> h <++> stressedVowel // stressedDiphthong) <++> loption rHyphen
    yRafsi      <- newRule $ (longRafsi // cvcRafsi) <++> y <++> loption h
    yLessRafsi  <- newRule $ not yRafsi ->> (cvcRafsi <<- not y // ccvRafsi // cvvRafsi) <<- not anyExtendedRafsi
    longRafsi  <- newRule $ initialPair <++> unstressedVowel <++> consonant // consonant <++> unstressedVowel <++> consonant  <++> consonant
    cvcRafsi    <- newRule $ consonant <++> unstressedVowel <++> consonant
    ccvRafsi    <- newRule $ initialPair <++> unstressedVowel
    cvvRafsi    <- newRule $ consonant <++> (unstressedVowel  <++> h <++> unstressedVowel // unstressedDiphthong) <++> loption rHyphen
    rHyphen     <- newRule $ r <<- matches consonant // n <<- matches r


    finalSyllable <- newRule $ (onset <<- not y <<- not stressed) <++> (nucleus <<- not cmene <<- matches postWord)
    stressedSyllable <- newRule $ matches stressed ->> syllable // syllable <<- matches stress
    stressedDiphthong <- newRule $ matches stressed ->> diphthong // diphthong <<- matches stress
    stressedVowel <- newRule $ matches stressed ->> vowel // vowel <<- matches stressed
    unstressedSyllable <- newRule $ not stressed ->> syllable <<- not stress // consonantalSyllable
    unstressedDiphthong <- newRule $ not stressed ->> diphthong <<- not stress
    unstressedVowel <- newRule $ not stressed ->> vowel <<- not stress
    stress <- newRule $ cmany consonant <++> loption y <++> syllable <++> pause
    stressed <- newRule $ onset <++> cmany comma <++> choice [a,e,i,o,u]
    anySyllable <- newRule $ onset <++> nucleus <++> loption coda // consonantalSyllable
    syllable <- newRule $ (onset <<- not y) <++> nucleus <++> loption l
    consonantalSyllable <- newRule $ consonant <++> syllabic <<- matches (consonantalSyllable // onset) <++> loption (consonant <<- matches spaces)
    coda <- newRule $ not anySyllable ->> consonant <<- anySyllable // loption syllabic <++> loption consonant <<- matches pause
    onset <- newRule $ h // loption consonant <++> glide // initial
    nucleus <- newRule $ vowel // diphthong // y <<- not nucleus


    glide <- newRule $ (i/u) <<- matches nucleus <<- not glide
    diphthong <- newRule $ (a <++> i // a <++> u // e <++> i // o <++> i) <<- not nucleus <<- not glide
    vowel <- newRule $ (a // e // i // o // u) <<- not nucleus
    let letter x = cmany comma ->> oneOf' ([x,toUpper x])
    a <- newRule $ letter 'a'
    e <- newRule $ letter 'e'
    i <- newRule $ letter 'i'
    o <- newRule $ letter 'o'
    u <- newRule $ letter 'u'
    y <- newRule $ letter 'y'

    cluster <- newRule $ consonant <++> cmany1 consonant
    initialPair <- newRule $ matches initial ->> consonant <++> consonant <<- not consonant
    initial <- newRule $ (affricate // loption sibilant <++> loption other <++> loption liquid) <<- not consonant <<- not glide
    affricate <- newRule $ t <++> c // t <++> s // d <++> j // d <++> z
    liquid <- newRule $ l // r
    other <- newRule $ p // t <<- not l // k // f // x // b // d <<- not l // g // v // m // n <<- not liquid
    sibilant <- newRule $ c // s <<- not x // (j // z) <<- not n <<- not liquid
    consonant <- newRule $ voiced // unvoiced // syllabic
    syllabic <- newRule $ l // m // n // r
    voiced <- newRule $ b // d // g // j // v // z
    unvoiced <- newRule $ c // f // k // p // s // t // x

    l <- newRule $ letter 'l' <<- not h <<- not l
    m <- newRule $ letter 'm' <<- not h <<- not m <<- not z
    n <- newRule $ letter 'n' <<- not h <<- not n <<- not affricate
    r <- newRule $ letter 'r' <<- not h <<- not r
    b <- newRule $ letter 'b' <<- not h <<- not b <<- not unvoiced
    d <- newRule $ letter 'd' <<- not h <<- not d <<- not unvoiced
    g <- newRule $ letter 'g' <<- not h <<- not g <<- not unvoiced
    v <- newRule $ letter 'v' <<- not h <<- not v <<- not unvoiced
    j <- newRule $ letter 'j' <<- not h <<- not j <<- not z <<- not unvoiced
    z <- newRule $ letter 'z' <<- not h <<- not z <<- not j <<- not unvoiced
    s <- newRule $ letter 's' <<- not h <<- not s <<- not c <<- not voiced
    c <- newRule $ letter 'c' <<- not h <<- not c <<- not s <<- not x <<- not voiced
    x <- newRule $ letter 'x' <<- not h <<- not x <<- not c <<- not x <<- not voiced
    k <- newRule $ letter 'k' <<- not h <<- not k <<- not x <<- not voiced
    f <- newRule $ letter 'f' <<- not h <<- not f <<- not voiced
    p <- newRule $ letter 'p' <<- not h <<- not p <<- not voiced
    t <- newRule $ letter 't' <<- not h <<- not t <<- not voiced
    h <- newRule $ cmany comma ->> oneOf' "'h" <++> nucleus
    

    digit <- newRule $ cmany comma ->> oneOf' "0123456789" <<- not h <<- not nucleus
    postWord <- newRule $ pause // not nucleus ->> lojbanWord
    pause <- newRule $ cmany comma <++> cmany1 spaceChar // eOF
    eOF <- newRule $ cmany comma <<- not (char '.')
    comma <- newRule $ text ","
    nonLojbanWord <- newRule $ not lojbanWord ->> cmany nonSpace
    nonSpace <- newRule $ not spaceChar ->> anyChar
    spaceChar <- newRule $ oneOf' ".\t\n\r?!\0020"

    spaces <- newRule $ not cY ->> initialSpaces
    initialSpaces <- newRule $ cmany1 (cmany comma <++> spaceChar // not ybu ->> cY) <++> loption eOF // eOF
    ybu <- newRule $ cY <++> cmany spaceChar <++> cBU

    cBU <- newRule $ matches cmavo ->> b <++> u <<- matches postWord
    cY <- newRule $ matches cmavo ->> many y <<- matches postWord
    return words

oneOf' = fmap (:[]) . oneOf


92:43: Error: Redundant bracket
Found:
([x, toUpper x])
Why not:
[x, toUpper x]