diff options
author | Frédéric Menou <frederic.menou@fretlink.com> | 2016-12-08 10:19:15 +0200 |
---|---|---|
committer | Ismaël Bouya <ismael.bouya@fretlink.com> | 2022-05-17 18:01:51 +0200 |
commit | a9d77a20008efe82862cc1adbfa7a6d4f09f8ff7 (patch) | |
tree | adf3186fdccaeef19151026cdfbd38a530cf9ecb /scaffolder/src/Text/Edifact/Fetcher | |
download | edi-parser-a9d77a20008efe82862cc1adbfa7a6d4f09f8ff7.tar.gz edi-parser-a9d77a20008efe82862cc1adbfa7a6d4f09f8ff7.tar.zst edi-parser-a9d77a20008efe82862cc1adbfa7a6d4f09f8ff7.zip |
Diffstat (limited to 'scaffolder/src/Text/Edifact/Fetcher')
-rw-r--r-- | scaffolder/src/Text/Edifact/Fetcher/Commons.hs | 87 | ||||
-rw-r--r-- | scaffolder/src/Text/Edifact/Fetcher/Composites.hs | 31 | ||||
-rw-r--r-- | scaffolder/src/Text/Edifact/Fetcher/Configuration.hs | 43 | ||||
-rw-r--r-- | scaffolder/src/Text/Edifact/Fetcher/Messages.hs | 84 | ||||
-rw-r--r-- | scaffolder/src/Text/Edifact/Fetcher/Segments.hs | 41 | ||||
-rw-r--r-- | scaffolder/src/Text/Edifact/Fetcher/Simples.hs | 27 |
6 files changed, 313 insertions, 0 deletions
diff --git a/scaffolder/src/Text/Edifact/Fetcher/Commons.hs b/scaffolder/src/Text/Edifact/Fetcher/Commons.hs new file mode 100644 index 0000000..1a6a058 --- /dev/null +++ b/scaffolder/src/Text/Edifact/Fetcher/Commons.hs | |||
@@ -0,0 +1,87 @@ | |||
1 | {-# LANGUAGE OverloadedStrings #-} | ||
2 | |||
3 | module Text.Edifact.Fetcher.Commons where | ||
4 | |||
5 | import Text.Edifact.Fetcher.Configuration | ||
6 | import Text.Edifact.Scaffolder.Commons | ||
7 | |||
8 | import Control.Monad ((>=>)) | ||
9 | import Control.Monad.Error.Class (MonadError, catchError) | ||
10 | import Control.Monad.IO.Class (MonadIO) | ||
11 | import Control.Monad.Reader (ReaderT, runReaderT) | ||
12 | import Control.Monad.Trans.Class (lift) | ||
13 | import Data.ByteString (ByteString) | ||
14 | import Data.Char (toLower) | ||
15 | import Data.Text as T (unpack, pack) | ||
16 | import Data.Text.IO as TIO (readFile, writeFile) | ||
17 | import Formatting | ||
18 | import System.Directory (doesFileExist) | ||
19 | import Text.Pandoc as Pandoc hiding (Format, | ||
20 | getOutputFile) | ||
21 | import Text.Parsec (Parsec, runParser) | ||
22 | |||
23 | type Fetcher = ReaderT FetchingEnv PandocIO | ||
24 | |||
25 | runFetcher :: Fetcher () -> FetchingEnv -> IO () | ||
26 | runFetcher f = Pandoc.runIOorExplode . runReaderT f | ||
27 | |||
28 | getOutputFile :: Format String (a -> String) -> FilePath -> a -> Fetcher FilePath | ||
29 | getOutputFile f d c = do | ||
30 | home <- getSpecificationHome | ||
31 | pure (formatToString (string % "/" % string % "/" % f) home d c) | ||
32 | |||
33 | getUrl :: Format String (a -> String) -> a -> Fetcher String | ||
34 | getUrl f c = do | ||
35 | rev <- getTargetRevision | ||
36 | pure (formatToString ("https://service.unece.org/trade/untdid/" % fRevisionLower % f) rev c) | ||
37 | |||
38 | getSpecificationHome :: Fetcher FilePath | ||
39 | getSpecificationHome = do | ||
40 | home <- getHome | ||
41 | rev <- getTargetRevision | ||
42 | pure (home </> formatToString fRevision rev) | ||
43 | |||
44 | htmlToFile :: String -> (ByteString -> Text) -> FilePath -> Fetcher () | ||
45 | htmlToFile url decoder outputFile = () <$ tryCacheOrHtml decoder url outputFile | ||
46 | |||
47 | htmlToFileWithParser :: (Monoid a, Monoid u) => String -> (ByteString -> Text) -> FilePath -> Parsec String u a -> Fetcher a | ||
48 | htmlToFileWithParser url decoder outputFile parser = do | ||
49 | specification <- tryCacheOrHtml decoder url outputFile | ||
50 | either (error . show) pure | ||
51 | (runParser parser mempty "" (T.unpack specification)) | ||
52 | |||
53 | tryCacheOrHtml :: (ByteString -> Text) -> String -> FilePath -> Fetcher Text | ||
54 | tryCacheOrHtml decoder url path = do | ||
55 | fileExists' <- liftIO $ doesFileExist path | ||
56 | content <- if fileExists' | ||
57 | then liftIO $ TIO.readFile path | ||
58 | else readHtmlFromURL decoder url >>= writePlain def | ||
59 | content <$ liftIO (TIO.writeFile path content) | ||
60 | |||
61 | readHtmlFromURL :: (ByteString -> Text) -> String -> Fetcher Pandoc | ||
62 | readHtmlFromURL decoder = lift . (openURL >=> readHtml def . decoder . fst) . pack | ||
63 | |||
64 | lower :: Format r (String -> r) | ||
65 | lower = mapf (fmap toLower) string | ||
66 | |||
67 | fRevision :: Format r (Revision -> r) | ||
68 | fRevision = mapf getRevision string | ||
69 | |||
70 | fRevisionLower :: Format r (Revision -> r) | ||
71 | fRevisionLower = mapf getRevision lower | ||
72 | |||
73 | fMessageCodeLower :: Format r (MessageCode -> r) | ||
74 | fMessageCodeLower = mapf getMessageCode lower | ||
75 | |||
76 | fSegmentCodeLower :: Format r (SegmentCode -> r) | ||
77 | fSegmentCodeLower = mapf getSegmentCode lower | ||
78 | |||
79 | fCompositeCodeLower :: Format r (CompositeCode -> r) | ||
80 | fCompositeCodeLower = mapf getCompositeCode lower | ||
81 | |||
82 | fSimpleCodeLower :: Format r (SimpleCode -> r) | ||
83 | fSimpleCodeLower = mapf getSimpleCode lower | ||
84 | |||
85 | retry :: (MonadIO m, MonadError b m) => Int -> m a -> m a | ||
86 | retry n f | n > 1 = f `catchError` const (say "Retrying" >> retry (n-1) f) | ||
87 | | otherwise = f | ||
diff --git a/scaffolder/src/Text/Edifact/Fetcher/Composites.hs b/scaffolder/src/Text/Edifact/Fetcher/Composites.hs new file mode 100644 index 0000000..8f94cc9 --- /dev/null +++ b/scaffolder/src/Text/Edifact/Fetcher/Composites.hs | |||
@@ -0,0 +1,31 @@ | |||
1 | {-# LANGUAGE OverloadedStrings #-} | ||
2 | |||
3 | module Text.Edifact.Fetcher.Composites | ||
4 | ( fetchComposites | ||
5 | , compositesDirectory | ||
6 | ) where | ||
7 | |||
8 | import Text.Edifact.Fetcher.Commons | ||
9 | import Text.Edifact.Scaffolder.Commons | ||
10 | |||
11 | import Text.Edifact.Scaffolder.Composites.Specification (listSimples) | ||
12 | |||
13 | import Data.List as L (nub, | ||
14 | sort) | ||
15 | import Data.Text.Encoding as TE (decodeUtf8) | ||
16 | import Formatting | ||
17 | |||
18 | compositesDirectory :: FilePath | ||
19 | compositesDirectory = "composites" | ||
20 | |||
21 | fetchComposites :: ([CompositeCode], [SimpleCode]) -> Fetcher [SimpleCode] | ||
22 | fetchComposites (composites, segments) = | ||
23 | let compactSimpleCodes = L.nub . L.sort . mappend segments . concat | ||
24 | in compactSimpleCodes <$> traverse (retry 3 . fetchComposite) composites | ||
25 | |||
26 | fetchComposite :: CompositeCode -> Fetcher [SimpleCode] | ||
27 | fetchComposite code = do | ||
28 | say ("Fetching composite " % fCompositeCode) code | ||
29 | url <- getUrl ("/trcd/trcd" % fCompositeCodeLower % ".htm") code | ||
30 | outputFile <- getOutputFile (fCompositeCodeLower % ".txt") compositesDirectory code | ||
31 | htmlToFileWithParser url TE.decodeUtf8 outputFile (snd <$> listSimples) | ||
diff --git a/scaffolder/src/Text/Edifact/Fetcher/Configuration.hs b/scaffolder/src/Text/Edifact/Fetcher/Configuration.hs new file mode 100644 index 0000000..a074641 --- /dev/null +++ b/scaffolder/src/Text/Edifact/Fetcher/Configuration.hs | |||
@@ -0,0 +1,43 @@ | |||
1 | {-# LANGUAGE FlexibleContexts #-} | ||
2 | {-# LANGUAGE OverloadedStrings #-} | ||
3 | |||
4 | module Text.Edifact.Fetcher.Configuration | ||
5 | ( -- * Fetching environment | ||
6 | FetchingEnv(..) | ||
7 | -- * Parsing helpers | ||
8 | , readSelectMessages | ||
9 | -- * Shortcuts for reading the environment | ||
10 | , getTargetRevision | ||
11 | , getHome | ||
12 | , getSelectedMessages | ||
13 | ) where | ||
14 | |||
15 | import Text.Edifact.Scaffolder.Commons (MessageCode, Revision, | ||
16 | messageCodeParser) | ||
17 | |||
18 | import Control.Monad.Reader.Class (MonadReader, asks) | ||
19 | import Data.List.NonEmpty (NonEmpty, nonEmpty) | ||
20 | import Text.Parsec (char, parse, sepBy1) | ||
21 | |||
22 | data FetchingEnv = FetchingEnv { fetchingRevision :: Revision | ||
23 | , specificationHome :: FilePath | ||
24 | , selectedMessages :: Maybe (NonEmpty MessageCode) | ||
25 | } | ||
26 | |||
27 | getTargetRevision :: MonadReader FetchingEnv m => m Revision | ||
28 | getTargetRevision = asks fetchingRevision | ||
29 | |||
30 | getHome :: MonadReader FetchingEnv m => m FilePath | ||
31 | getHome = asks specificationHome | ||
32 | |||
33 | getSelectedMessages :: MonadReader FetchingEnv m => m (Maybe (NonEmpty MessageCode)) | ||
34 | getSelectedMessages = asks selectedMessages | ||
35 | |||
36 | readSelectMessages :: Maybe String -> Maybe (NonEmpty MessageCode) | ||
37 | readSelectMessages value = | ||
38 | let tryParse p s = toMaybe . parse p s | ||
39 | toMaybe (Right v) = Just v | ||
40 | toMaybe _ = Nothing | ||
41 | messageCodesParser = messageCodeParser `sepBy1` comma | ||
42 | comma = char ',' | ||
43 | in value >>= tryParse messageCodesParser "" >>= nonEmpty | ||
diff --git a/scaffolder/src/Text/Edifact/Fetcher/Messages.hs b/scaffolder/src/Text/Edifact/Fetcher/Messages.hs new file mode 100644 index 0000000..9daf98a --- /dev/null +++ b/scaffolder/src/Text/Edifact/Fetcher/Messages.hs | |||
@@ -0,0 +1,84 @@ | |||
1 | {-# LANGUAGE OverloadedStrings #-} | ||
2 | |||
3 | module Text.Edifact.Fetcher.Messages | ||
4 | ( fetchMessages | ||
5 | , messagesDirectory | ||
6 | ) where | ||
7 | |||
8 | import Text.Edifact.Fetcher.Commons | ||
9 | import Text.Edifact.Fetcher.Configuration | ||
10 | import Text.Edifact.Scaffolder.Commons | ||
11 | |||
12 | import Text.Edifact.Scaffolder.Messages.Specification (listSegments) | ||
13 | |||
14 | import Data.Foldable (toList) | ||
15 | import Data.List as L (nub, sort) | ||
16 | import Data.Maybe (mapMaybe) | ||
17 | import Data.Text as T (map) | ||
18 | import Data.Text.Encoding as TE (decodeLatin1, | ||
19 | decodeUtf8) | ||
20 | import Formatting | ||
21 | import Text.Pandoc as Pandoc hiding (Format, | ||
22 | getOutputFile) | ||
23 | import Text.Pandoc.Walk (query) | ||
24 | import Text.Parsec (parse) | ||
25 | |||
26 | messagesDirectory :: FilePath | ||
27 | messagesDirectory = "messages" | ||
28 | |||
29 | parseMessageCode :: Text -> Maybe MessageCode | ||
30 | parseMessageCode = | ||
31 | let toMaybe (Right v) = Just v | ||
32 | toMaybe _ = Nothing | ||
33 | in toMaybe . parse messageCodeParser "" | ||
34 | |||
35 | scanInlineForMessageCode :: Inline -> Maybe MessageCode | ||
36 | scanInlineForMessageCode (Str label) = parseMessageCode label | ||
37 | scanInlineForMessageCode _ = Nothing | ||
38 | |||
39 | -- The trick here is to reverse the usage of UNH which is mandatory on every single message | ||
40 | listAllMessages :: Fetcher [MessageCode] | ||
41 | listAllMessages = | ||
42 | let filterLink (Link _ inlines _) = mapMaybe scanInlineForMessageCode inlines | ||
43 | filterLink _ = [] | ||
44 | extractMessageCodes = query filterLink | ||
45 | loadUNHUsages = readHtmlFromURL TE.decodeUtf8 =<< getUrl ("/trsd/cseg" % fSegmentCodeLower % ".htm") "UNH" | ||
46 | in extractMessageCodes <$> loadUNHUsages | ||
47 | |||
48 | listMessages :: Fetcher [MessageCode] | ||
49 | listMessages = getSelectedMessages >>= maybe listAllMessages (pure . toList) | ||
50 | |||
51 | fetchMessages :: Fetcher [SegmentCode] | ||
52 | fetchMessages = listMessages >>= fmap (L.nub . L.sort . concat) . traverse fetchMessage | ||
53 | |||
54 | fetchMessage :: MessageCode -> Fetcher [SegmentCode] | ||
55 | fetchMessage code = do | ||
56 | retry 3 (fetchMessageDefinition code) | ||
57 | retry 3 (fetchMessageSummary code) | ||
58 | |||
59 | fetchMessageDefinition :: MessageCode -> Fetcher () | ||
60 | fetchMessageDefinition code = do | ||
61 | say ("Fetching message " % fMessageCode % " definition") code | ||
62 | url <- getUrl ("/trmd/" % fMessageCodeLower % "_d.htm") code | ||
63 | outputFile <- getOutputFile (fMessageCodeLower % "_d.txt") messagesDirectory code | ||
64 | htmlToFile url TE.decodeUtf8 outputFile | ||
65 | |||
66 | fetchMessageSummary :: MessageCode -> Fetcher [SegmentCode] | ||
67 | fetchMessageSummary code = do | ||
68 | say ("Fetching message " % fMessageCode % " summary") code | ||
69 | url <- getUrl ("/trmd/" % fMessageCodeLower % "_s.htm") code | ||
70 | outputFile <- getOutputFile (fMessageCodeLower % "_s.txt") messagesDirectory code | ||
71 | let decoder = cleanupAsciiArt . TE.decodeLatin1 | ||
72 | htmlToFileWithParser url decoder outputFile listSegments | ||
73 | |||
74 | -- This might not be the proper way to do it... | ||
75 | -- Use Data.Text.Encoding.decodeUtf8With instead? | ||
76 | cleanupAsciiArt :: Text -> Text | ||
77 | cleanupAsciiArt = | ||
78 | let f 'Ä' = '-' | ||
79 | f '¿' = '+' | ||
80 | f '³' = '|' | ||
81 | f 'Ù' = '+' | ||
82 | f 'Á' = '+' | ||
83 | f c = c | ||
84 | in T.map f | ||
diff --git a/scaffolder/src/Text/Edifact/Fetcher/Segments.hs b/scaffolder/src/Text/Edifact/Fetcher/Segments.hs new file mode 100644 index 0000000..dda1d88 --- /dev/null +++ b/scaffolder/src/Text/Edifact/Fetcher/Segments.hs | |||
@@ -0,0 +1,41 @@ | |||
1 | {-# LANGUAGE OverloadedStrings #-} | ||
2 | |||
3 | module Text.Edifact.Fetcher.Segments | ||
4 | ( fetchSegments | ||
5 | , segmentsDirectory | ||
6 | ) where | ||
7 | |||
8 | import Text.Edifact.Fetcher.Commons | ||
9 | import Text.Edifact.Scaffolder.Commons | ||
10 | |||
11 | import Text.Edifact.Scaffolder.Segments.Specification (listCompositesAndSimples) | ||
12 | import Text.Edifact.Scaffolder.Segments.Types (Element, | ||
13 | getComposite, | ||
14 | getSimple) | ||
15 | |||
16 | import Data.Bifunctor (bimap) | ||
17 | import Data.List as L (nub, | ||
18 | partition, | ||
19 | sort) | ||
20 | import Data.Maybe (isJust, | ||
21 | mapMaybe) | ||
22 | import Data.Text.Encoding as TE (decodeUtf8) | ||
23 | import Formatting | ||
24 | |||
25 | segmentsDirectory :: FilePath | ||
26 | segmentsDirectory = "segments" | ||
27 | |||
28 | fetchSegments :: [SegmentCode] -> Fetcher ([CompositeCode], [SimpleCode]) | ||
29 | fetchSegments = fmap (partitionElements . L.nub . L.sort . concat) . traverse (retry 3 . fetchSegment) | ||
30 | |||
31 | partitionElements :: [Element] -> ([CompositeCode], [SimpleCode]) | ||
32 | partitionElements = | ||
33 | let isComposite = isJust . getComposite | ||
34 | in bimap (mapMaybe getComposite) (mapMaybe getSimple) . partition isComposite | ||
35 | |||
36 | fetchSegment :: SegmentCode -> Fetcher [Element] | ||
37 | fetchSegment code = do | ||
38 | say ("Fetching segment " % fSegmentCode) code | ||
39 | url <- getUrl ("/trsd/trsd" % fSegmentCodeLower % ".htm") code | ||
40 | outputFile <- getOutputFile (fSegmentCodeLower % ".txt") segmentsDirectory code | ||
41 | htmlToFileWithParser url TE.decodeUtf8 outputFile (snd <$> listCompositesAndSimples) | ||
diff --git a/scaffolder/src/Text/Edifact/Fetcher/Simples.hs b/scaffolder/src/Text/Edifact/Fetcher/Simples.hs new file mode 100644 index 0000000..47951ad --- /dev/null +++ b/scaffolder/src/Text/Edifact/Fetcher/Simples.hs | |||
@@ -0,0 +1,27 @@ | |||
1 | {-# LANGUAGE OverloadedStrings #-} | ||
2 | |||
3 | module Text.Edifact.Fetcher.Simples | ||
4 | ( fetchSimples | ||
5 | , simplesDirectory | ||
6 | ) where | ||
7 | |||
8 | import Text.Edifact.Fetcher.Commons | ||
9 | import Text.Edifact.Scaffolder.Commons | ||
10 | |||
11 | |||
12 | import Data.Foldable (traverse_) | ||
13 | import Data.Text.Encoding as TE (decodeUtf8) | ||
14 | import Formatting | ||
15 | |||
16 | simplesDirectory :: FilePath | ||
17 | simplesDirectory = "simples" | ||
18 | |||
19 | fetchSimples :: [SimpleCode] -> Fetcher () | ||
20 | fetchSimples = traverse_ (retry 3 . fetchSimple) | ||
21 | |||
22 | fetchSimple :: SimpleCode -> Fetcher () | ||
23 | fetchSimple code = do | ||
24 | say ("Fetching simple " % fSimpleCode) code | ||
25 | url <- getUrl ("/uncl/uncl" % fSimpleCodeLower % ".htm") code | ||
26 | outputFile <- getOutputFile (fSimpleCodeLower % ".txt") simplesDirectory code | ||
27 | htmlToFile url TE.decodeUtf8 outputFile | ||