aboutsummaryrefslogtreecommitdiffhomepage
path: root/core/src/Text/Edifact/Parsing
diff options
context:
space:
mode:
authorFrédéric Menou <frederic.menou@fretlink.com>2016-12-08 10:19:15 +0200
committerIsmaël Bouya <ismael.bouya@fretlink.com>2022-05-17 18:01:51 +0200
commita9d77a20008efe82862cc1adbfa7a6d4f09f8ff7 (patch)
treeadf3186fdccaeef19151026cdfbd38a530cf9ecb /core/src/Text/Edifact/Parsing
downloadedi-parser-master.tar.gz
edi-parser-master.tar.zst
edi-parser-master.zip
Release code as open sourceHEADmaster
Diffstat (limited to 'core/src/Text/Edifact/Parsing')
-rw-r--r--core/src/Text/Edifact/Parsing/Combinators.hs230
-rw-r--r--core/src/Text/Edifact/Parsing/Commons.hs173
-rw-r--r--core/src/Text/Edifact/Parsing/Primitives.hs127
3 files changed, 530 insertions, 0 deletions
diff --git a/core/src/Text/Edifact/Parsing/Combinators.hs b/core/src/Text/Edifact/Parsing/Combinators.hs
new file mode 100644
index 0000000..ce3f4be
--- /dev/null
+++ b/core/src/Text/Edifact/Parsing/Combinators.hs
@@ -0,0 +1,230 @@
1{-# LANGUAGE TupleSections #-}
2
3{-|
4Module : Text.Edifact.Parsing.Combinators
5Description : High level combinators
6 -}
7module Text.Edifact.Parsing.Combinators
8 ( -- * Combinators
9 -- ** Values parsers
10 message
11 , segmentGroup
12 , segment
13 , composite
14 , simple
15
16 -- ** Position and strictness
17 , position
18 , (.@)
19 , (@.)
20 , mandatory
21 , optional
22
23 -- ** Repetition of segments and segment groups
24 , repeated
25 , repeatedAtLeastOnce
26 , once
27 , maybeOnce
28 ) where
29
30import Text.Edifact.Parsing.Commons
31import Text.Edifact.Types
32
33import Text.Parsec (lookAhead, many1, optionMaybe,
34 string, try)
35import qualified Text.Parsec as P (many)
36
37-- | Parses a 'Message'.
38--
39-- > messageABCDEF :: Parser Value
40-- > messageABCDEF =
41-- > let simple1234 = simple "1234" (alphaNumeric `upTo` 35)
42-- > c101 = composite "C101" [ position "010" (mandatory simple1234)
43-- > , position "020" (optional simple1234)
44-- > ]
45-- > segmentABC = segment "ABC" [ position "010" (mandatory c101)
46-- > ]
47-- > in message "ABCDEF" [ position "0010" (mandatory segmentABC)
48-- > ]
49message :: MessageCode -> [Parser (Position, [Value])] -> Parser Value
50message code ps =
51 let description = "message " <> show code
52 in Message code <$> sequence ps <??> description
53
54-- | Parses a 'Group'.
55--
56-- A Segment Group is the way Edifact format represents hierarchy. One can view
57-- a segment group as a sub message. A segment group can be repeated like
58-- segments. A segment group wraps segments and segment groups.
59segmentGroup :: GroupCode -> [Parser (Position, [Value])] -> Parser Value
60segmentGroup code ps =
61 let description = "segment-group " <> show code
62 in Group code <$> sequence ps <??> description
63
64-- | Parses a 'Segment'.
65--
66-- Following parser:
67--
68-- > segmentABC :: Parser Value
69-- > segmentABC =
70-- > let simple1234 = simple "1234" (alphaNumeric `upTo` 35)
71-- > simple2001 = simple "2001" (alphaNumeric `exactly` 3)
72-- > c101 = composite "C101" [ position "010" (mandatory simple1234)
73-- > , position "020" (optional simple1234)
74-- > , position "030" (optional simple1234)
75-- > ]
76-- > in segment "ABC" [ position "010" (mandatory simple2001)
77-- > , position "020" (optional c101)
78-- > ]
79--
80-- would parse strings such as:
81--
82-- >>> parse segmentABC "ABC+123'"
83-- Segment "ABC" [ ("010", Just (Simple "2001" "123"))
84-- ]
85-- >>> parse segmentABC "ABC+123+abcdefgh'"
86-- Segment "ABC" [ ("010", Just (Simple "2001" "123"))
87-- , ("020", Just (Composite "C101" [ ("010", Just (Simple "1234" "abcdefgh"))
88-- ]
89-- ))
90-- ]
91-- >>> parse segmentABC "ABC+123+abcdefgh:ijklmno'"
92-- Segment "ABC" [ ("010", Just (Simple "2001" "123"))
93-- , ("020", Just (Composite "C101" [ ("010", Just (Simple "1234" "abcdefgh"))
94-- , ("020", Just (Simple "1234" "ijklmno"))
95-- ]
96-- ))
97-- ]
98segment :: SegmentCode -> [Parser (Position, Maybe Value)] -> Parser Value
99segment code parsers =
100 let go [] = [] <$ parseSegmentSeparator
101 go (p:ps) =
102 tries [ [] <$ parseSegmentSeparator
103 , (:) <$> (parseElementSeparator *> p)
104 <*> go ps
105 ]
106 description = "segment " <> show code
107 in Segment <$> parseSegmentCode code
108 <*> go parsers
109 <??> description
110
111parseSegmentCode :: SegmentCode -> Parser SegmentCode
112parseSegmentCode (SegmentCode code) =
113 let description = "segment code " <> show code
114 in SegmentCode <$> string code <??> description
115
116-- | Parses a 'Composite' element.
117--
118-- Following parser:
119--
120-- > compositeC101 :: Parser Value
121-- > compositeC101 =
122-- > let simple1234 = simple "1234" (alphaNumeric `upTo` 35)
123-- > in composite "C101" [ position "010" (mandatory simple1234)
124-- > , position "020" (optional simple1234)
125-- > , position "030" (optional simple1234)
126-- > ]
127--
128-- would parse strings such as:
129--
130-- >>> parse compositeC101 "abcdefgh"
131-- Composite "C101" [ ("010", Just (Simple "1234" "abcdefgh"))
132-- ]
133-- >>> parse compositeC101 "abcdefgh:ijklmno"
134-- Composite "C101" [ ("010", Just (Simple "1234" "abcdefgh"))
135-- , ("020", Just (Simple "1234" "ijklmno"))
136-- ]
137-- >>> parse compositeC101 "abcdefgh::pqrstu"
138-- Composite "C101" [ ("010", Just (Simple "1234" "abcdefgh"))
139-- , ("020", Just (Simple "1234" ""))
140-- , ("030", Just (Simple "1234" "pqrstu"))
141-- ]
142composite :: CompositeCode -> [Parser (Position, Maybe Value)] -> Parser Value
143composite code parsers =
144 let go [] = pure []
145 go (p:ps) = do
146 let parseSeparator = tries [ parseCompositeSeparator
147 , lookAhead parseElementSeparator
148 , lookAhead parseSegmentSeparator
149 ]
150 (value, continuation) <- tries [ (, ps) <$> p <* parseSeparator
151 , (, []) <$> p
152 ]
153 (:) value <$> go continuation
154 description = "composite element " <> show code
155 in Composite code <$> go parsers <??> description
156
157-- | Parses a 'Simple' element.
158--
159-- Following parser would parse strings of size between 0 and 35 characters.
160--
161-- > simple1234 :: Parser Value
162-- > simple1234 = simple "1234" (alphaNumeric `upTo` 35)
163simple :: SimpleCode -> Parser Primitive -> Parser Value
164simple code p =
165 let description = "simple element " <> show code
166 in Simple code <$> p <??> description
167
168-- | Makes the parsing of the element optional, which doesn't consume input if the given parser doesn't succeed.
169optional :: Parser Value -> Parser (Maybe Value)
170optional = optionMaybe
171
172-- | Makes the parsing of the element mandatory.
173mandatory :: Parser Value -> Parser (Maybe Value)
174mandatory = fmap Just
175
176-- | Sets the current 'Position'. This is relevant for segment in a message, for composite or simple element in a segment, and for simple element in a composite.
177position :: Position -> Parser (f Value) -> Parser (Position, f Value)
178position pos p =
179 let decorated = (pos,) <$> p
180 in setCurrentPosition pos *> decorated <* resetCurrentPosition
181
182-- | Alias to 'position'.
183--
184-- > compositeC101 :: Parser Value
185-- > compositeC101 =
186-- > let simple1234 = simple "1234" (alphaNumeric `upTo` 35)
187-- > in composite "C101" [ "010" .@ mandatory simple1234
188-- > , "020" .@ optional simple1234
189-- > , "030" .@ optional simple1234
190-- > ]
191(.@) :: Position -> Parser (f Value) -> Parser (Position, f Value)
192(.@) = position
193
194-- | Flipped alias to 'position'.
195--
196-- > compositeC101 :: Parser Value
197-- > compositeC101 =
198-- > let simple1234 = simple "1234" (alphaNumeric `upTo` 35)
199-- > in composite "C101" [ mandatory simple1234 @. "010"
200-- > , optional simple1234 @. "020"
201-- > , optional simple1234 @. "030"
202-- > ]
203(@.) :: Parser (f Value) -> Position -> Parser (Position, f Value)
204(@.) = flip position
205
206-- | For segments or segment groups, let you express how many occurrences.
207repeated :: Int -> Parser a -> Parser [a]
208repeated limit p = do
209 values <- P.many (try p)
210 let parsed = length values
211 if parsed > limit
212 then failWithPosition ("expected up to " <> show limit <> " items, but encountered " <> show parsed)
213 else pure values
214
215-- | For segments or segment groups, let you express how many occurrences with at least one occurrence.
216repeatedAtLeastOnce :: Int -> Parser a -> Parser [a]
217repeatedAtLeastOnce limit p = do
218 values <- many1 (try p)
219 let parsed = length values
220 if parsed > limit
221 then failWithPosition ("expected up to " <> show limit <> " items, but encountered " <> show parsed)
222 else pure values
223
224-- | For segments or segment groups, let you express you expect only one occurrence.
225once :: Parser a -> Parser [a]
226once = fmap pure
227
228-- | For segments or segment groups, let you express you expect one or no occurrence.
229maybeOnce :: Parser a -> Parser [a]
230maybeOnce = fmap (maybe [] pure) . optionMaybe
diff --git a/core/src/Text/Edifact/Parsing/Commons.hs b/core/src/Text/Edifact/Parsing/Commons.hs
new file mode 100644
index 0000000..a1c6150
--- /dev/null
+++ b/core/src/Text/Edifact/Parsing/Commons.hs
@@ -0,0 +1,173 @@
1module Text.Edifact.Parsing.Commons
2 ( -- * Parsing context
3 Parser
4 , Context(..)
5 , CurrentPosition(..)
6 , defaultContext
7
8 -- * State combinators
9 , updateSyntax
10 , setCurrentPosition
11 , resetCurrentPosition
12
13 -- * Syntax helpers
14 -- ** Parsing combinators
15 , parseCompositeSeparator
16 , parseElementSeparator
17 , parseSegmentSeparator
18 , parseEscape
19
20 -- ** State accessors
21 -- | Shortcuts to the syntax in current state. Doesn't alter input stream.
22 , getCompositeSeparator
23 , getElementSeparator
24 , getSegmentSeparator
25 , getDecimalSign
26
27 -- * Context aware failure helpers
28 , failWithPosition
29 , (<??>)
30
31 -- * Parsec extras
32 , tries
33
34 -- * Technical combinators
35 , notYetImplemented
36 ) where
37
38import Text.Edifact.Types (Position, Syntax (..), defaultSyntax)
39
40import Data.Text (Text)
41import Text.Parsec (Parsec, char, choice, endOfLine, getState,
42 modifyState, try, updateState, (<?>))
43
44-- | Defines our "Text.Parsec" context.
45type Parser = Parsec Text Context
46
47data Context =
48 Context
49 { parsingSyntax :: Syntax -- ^ State of the syntax. To be updated on the encounter of the @UNA@ segment.
50 , currentPosition :: CurrentPosition -- ^ Pointer for current position in the parser. Used for enriched parsing error messages.
51 }
52
53defaultContext :: Context
54defaultContext = Context defaultSyntax Undefined
55
56-- | Current position in the parser.
57--
58-- For now it only stores the current position in a message, a segment group,
59-- a segment, or a composite.
60--
61-- Future version could store the whole path to improve debugging.
62data CurrentPosition = Undefined
63 | Defined Position
64 deriving Show
65
66getSyntax :: Parser Syntax
67getSyntax = parsingSyntax <$> getState
68
69-- | Get current charactor for decimal sign.
70-- It doesn't parse nor consume input.
71getDecimalSign :: Parser Char
72getDecimalSign = decimalSign <$> getSyntax
73
74-- | Get current charactor for segment separator.
75-- It doesn't parse nor consume input.
76getSegmentSeparator :: Parser Char
77getSegmentSeparator = segmentSeparator <$> getSyntax
78
79-- | Get current charactor for element separator.
80-- It doesn't parse nor consume input.
81getElementSeparator :: Parser Char
82getElementSeparator = elementSeparator <$> getSyntax
83
84-- | Get current charactor for composite separator.
85-- It doesn't parse nor consume input.
86getCompositeSeparator :: Parser Char
87getCompositeSeparator = compositeSeparator <$> getSyntax
88
89-- | This let change the operators used in the parsing. This is designed for the @UNA@ segment.
90updateSyntax :: Syntax -> Parser ()
91updateSyntax s = updateState (\ c -> c { parsingSyntax = s })
92
93-- | Read the parser state to extract current position.
94-- It doesn't parse nor consume input.
95getCurrentPosition :: Parser CurrentPosition
96getCurrentPosition = currentPosition <$> getState
97
98-- | Write the parser state to update current position.
99-- It doesn't parse nor consume input.
100setCurrentPosition :: Position -> Parser ()
101setCurrentPosition = updateCurrentPosition . Defined
102
103-- | Write the parser state to reset current position.
104-- It doesn't parse nor consume input.
105resetCurrentPosition :: Parser ()
106resetCurrentPosition = updateCurrentPosition Undefined
107
108updateCurrentPosition :: CurrentPosition -> Parser ()
109updateCurrentPosition pos = modifyState (\s -> s { currentPosition = pos })
110
111-- | Parse current charactor for element separator.
112-- It does parse and consume input.
113parseElementSeparator :: Parser Char
114parseElementSeparator = parseSpecialChar "element separator" elementSeparator
115
116-- | Parse current charactor for composite separator.
117-- It does parse and consume input.
118parseCompositeSeparator :: Parser Char
119parseCompositeSeparator = parseSpecialChar "composite separator" compositeSeparator
120
121-- | Parse current charactor for escape separator.
122-- It does parse and consume input.
123parseEscape :: Parser Char
124parseEscape = parseSpecialChar "escape character" escape
125
126-- | Parse current charactor for segment separator.
127-- It does parse and consume input.
128--
129-- It also tries consuming end of line after segment separator if any.
130parseSegmentSeparator :: Parser Char
131parseSegmentSeparator = tries [ parseSpecialChar "segment separator" segmentSeparator <* endOfLine
132 , parseSpecialChar "segment separator" segmentSeparator
133 ]
134
135parseSpecialChar :: String -> (Syntax -> Char) -> Parser Char
136parseSpecialChar description reader = do
137 c <- reader <$> getSyntax
138 let escape' '\"' = "\\\""
139 escape' c' = [c']
140 comment = description <> " (\"" <> escape' c <> "\")"
141 char c <?> comment
142
143-- | Let you try various parsers, not consuming until success.
144tries :: [Parser a] -> Parser a
145tries = choice . map try
146
147-- | Like 'fail', this operator let you annotate a parser if it were to fail.
148-- The difference with the standard "Text.Parsec" operator is that it appends
149-- the current position if any.
150failWithPosition :: String -> Parser a
151failWithPosition = withPosition fail
152
153-- | Like '<?>', this operator let you annotate a parser if it were to fail.
154-- The difference with the standard "Text.Parsec" operator is that it appends
155-- the current position if any.
156(<??>) :: Parser a -> String -> Parser a
157(<??>) = withPosition . (<?>)
158
159-- Same priority as <?> from Text.Parsec
160infix 0 <??>
161
162withPosition :: (String -> Parser a) -> String -> Parser a
163withPosition continuation msg =
164 let mkMessage Undefined = msg
165 mkMessage (Defined d) = msg <> " at position " <> show d
166 getMessage = mkMessage <$> getCurrentPosition
167 in getMessage >>= continuation
168
169-- | Alias to 'failWithPosition' to convey semantics of work-in-progress when
170-- writing a parser. This might be useful if you want to partially support a
171-- message.
172notYetImplemented :: String -> Parser a
173notYetImplemented = failWithPosition
diff --git a/core/src/Text/Edifact/Parsing/Primitives.hs b/core/src/Text/Edifact/Parsing/Primitives.hs
new file mode 100644
index 0000000..61659c8
--- /dev/null
+++ b/core/src/Text/Edifact/Parsing/Primitives.hs
@@ -0,0 +1,127 @@
1{-|
2Module : Text.Edifact.Parsing.Primitives
3Description : Low level combinators
4
5This module let you build parsers for primitive values, ie. values contained
6in a simple element, either text or number.
7
8= Examples
9
10To parse a text of 3 characters (@an3@ in standard Edifact representation):
11
12> an3 :: Parser Primitive
13> an3 = alphaNumeric `exactly` 3
14
15To parse a text of up to 10 characters (@an..10@ in standard Edifact representation):
16
17> an_10 :: Parser Primitive
18> an_10 = alphaNumeric `upTo` 10
19
20= Known limitations
21
22Numeric representation is not strictly compatible to the specification.
23The specification tells that negative sign (@-@) and decimal sign (@.@) are not
24to be counted in the length of the field.
25
26Therefore the following parser will fail even it's legal according to the
27specification:
28
29> n_3 :: Parser Primitive
30> n_3 = numeric `upTo` 3
31>
32> parse n_3 "-12.3"
33
34To be fixed, we have to change the way primitives combinators are built so that
35the 'upTo' and 'exactly' combinators are aware of the inner parser.
36 -}
37module Text.Edifact.Parsing.Primitives
38 (
39 -- * Primitives
40 -- ** Simple elements definition
41 alphaNumeric
42 , alpha
43 , numeric
44
45 -- ** Cardinality
46 , exactly
47 , upTo
48 , many
49
50 ) where
51
52import Text.Edifact.Parsing.Commons
53import Text.Edifact.Types
54
55import Data.String (fromString)
56import qualified Data.Text as T (length)
57import Text.Parsec (count, lookAhead, many1, noneOf,
58 oneOf)
59import qualified Text.Parsec as P (many)
60
61-- | Parser associated with the @an@ notation.
62alphaNumeric :: Parser Char
63alphaNumeric = do
64 separators <- sequence [ getSegmentSeparator
65 , getElementSeparator
66 , getCompositeSeparator
67 ]
68 tries [ parseEscape *> parseSegmentSeparator
69 , parseEscape *> parseElementSeparator
70 , parseEscape *> parseCompositeSeparator
71 , parseEscape *> parseEscape
72 , noneOf separators
73 ]
74
75-- | Parser associated with the @a@ notation.
76--
77-- So far it's simply an alias to 'alphaNumeric'.
78alpha :: Parser Char
79alpha = alphaNumeric
80
81-- | Parser associated with the @n@ notation.
82numeric :: Parser Char
83numeric = do
84 punctuationSign <- getDecimalSign
85 oneOf (punctuationSign : "0123456789-")
86
87-- | Combinator to build a parser of primitive which length is unspecified.
88--
89-- Correspondance with the Edifact notation:
90--
91-- > many alpha # same as a
92-- > many numeric # same as n
93-- > many alphaNumeric # same as an
94many :: Parser Char -> Parser Primitive
95many = fmap fromString . many1
96
97-- | Combinator to build a parser of primitive which length is capped.
98--
99-- Correspondance with the Edifact notation:
100--
101-- > alpha `upTo` 3 # same as a..3
102-- > numeric `upTo` 3 # same as n..3
103-- > alphaNumeric `upTo` 3 # same as an..3
104upTo :: Parser Char -> Int -> Parser Primitive
105upTo p c =
106 let check t =
107 let c' = T.length t
108 in if c' > c
109 then failWithPosition ("expected up to " <> show c <> " characters, but encountered " <> show c')
110 else pure (String t)
111 maybeEmpty = (<$) mempty . lookAhead
112 in check =<<
113 tries [ maybeEmpty parseSegmentSeparator
114 , maybeEmpty parseElementSeparator
115 , maybeEmpty parseCompositeSeparator
116 , fromString <$> P.many p
117 ]
118
119-- | Combinator to build a parser of primitive which length is fixed.
120--
121-- Correspondance with the Edifact notation:
122--
123-- > alpha `exactly` 3 # same as a3
124-- > numeric `exactly` 3 # same as n3
125-- > alphaNumeric `exactly` 3 # same as an3
126exactly :: Parser Char -> Int -> Parser Primitive
127exactly p c = fromString <$> count c p