/ src / PandocReader.hs
PandocReader.hs
  1  {-# LANGUAGE OverloadedStrings #-}
  2  
  3  module PandocReader (toPandoc, readAutomerge) where
  4  
  5  import Automerge (BlockMarker (..), BlockSpan (..), Heading (..), HeadingLevel (..), Link (..), Mark (..), NoteId (..), Span (..), TextSpan (..), isEmbed, isParent, parseAutomergeSpansText, takeUntilNextSameBlockTypeSibling, takeUntilNonEmbedBlockSpan)
  6  import Control.Monad ((>=>))
  7  import Control.Monad.Except (throwError)
  8  import Data.List (find, groupBy)
  9  import Data.List.NonEmpty (NonEmpty (..), nonEmpty, toList)
 10  import qualified Data.Map as M
 11  import Data.Maybe (fromMaybe)
 12  import qualified Data.Text as T
 13  import Data.Tree (Tree (Node), drawTree, foldTree, unfoldForest)
 14  import Debug.Trace
 15  import Text.Pandoc (PandocError (PandocParseError, PandocSyntaxMapError), ReaderOptions)
 16  import Text.Pandoc.Builder as Pandoc
 17    ( Block (..),
 18      Blocks,
 19      Inline (Note, Str),
 20      Inlines,
 21      ListNumberDelim (DefaultDelim),
 22      ListNumberStyle (DefaultStyle),
 23      Pandoc,
 24      code,
 25      doc,
 26      emph,
 27      fromList,
 28      link,
 29      nullAttr,
 30      singleton,
 31      str,
 32      strong,
 33      toList,
 34    )
 35  import Text.Pandoc.Class (PandocMonad)
 36  import Text.Pandoc.Sources (ToSources, sourcesToText, toSources)
 37  import Utils.Sequence (firstValue)
 38  
 39  -- Although ReaderOptions are not used, the function is written like this so that it's consistent with the other Pandoc reader functions.
 40  readAutomerge :: (PandocMonad m, ToSources a) => ReaderOptions -> a -> m Pandoc
 41  readAutomerge _ =
 42    -- Using Kleisli composition to compose the 2 smaller functions in the monadic context (PandocMonad)
 43    parseSpansAndHandleErrors >=> toPandoc
 44    where
 45      -- Here we parse the JSON Automerge spans but also convert a potential error to a Pandoc parsing error
 46      -- The result is a list of Automerge spans wrapped within a Pandoc monad.
 47      parseSpansAndHandleErrors :: (ToSources a, PandocMonad m) => a -> m [Automerge.Span]
 48      parseSpansAndHandleErrors = either handleParsingErrorMessage pure . parseSpans
 49  
 50      parseSpans :: (ToSources a) => a -> Either String [Span]
 51      parseSpans = parseAutomergeSpansText . sourcesToText . toSources
 52  
 53      handleParsingErrorMessage :: (PandocMonad m) => String -> m a
 54      handleParsingErrorMessage = throwError . PandocParseError . T.pack
 55  
 56  toPandoc :: (PandocMonad m) => [Automerge.Span] -> m Pandoc.Pandoc
 57  toPandoc = (either throwError (pure . Pandoc.doc)) . convertSpansToBlocks
 58    where
 59      convertSpansToBlocks :: [Automerge.Span] -> Either PandocError Pandoc.Blocks
 60      convertSpansToBlocks = fromMaybe (Right $ Pandoc.fromList []) . fmap treeToPandocBlocks . buildTree
 61  
 62  newtype NoteId = NoteId T.Text deriving (Show, Eq, Ord)
 63  
 64  data BlockNode = PandocBlock Pandoc.Block | BulletListItem | OrderedListItem | NoteRef PandocReader.NoteId | NoteContent PandocReader.NoteId deriving (Show)
 65  
 66  data DocNode = Root | BlockNode BlockNode | InlineNode Pandoc.Inlines deriving (Show)
 67  
 68  traceTree :: Tree DocNode -> Tree DocNode
 69  traceTree tree = Debug.Trace.trace (drawTree $ fmap show tree) tree
 70  
 71  buildTree :: [Automerge.Span] -> Maybe (Tree DocNode)
 72  buildTree = (fmap (traceTree . mapNotesToPandocNotes . groupListItems . buildRawTree)) . nonEmpty
 73  
 74  buildRawTree :: NonEmpty Automerge.Span -> Tree DocNode
 75  buildRawTree spans = Node Root $ unfoldForest buildDocNode $ getTopLevelBlockSeeds spansList
 76    where
 77      spansList = Data.List.NonEmpty.toList spans
 78      getTopLevelBlockSeeds = getChildBlockSeeds Nothing
 79  
 80  buildDocNode :: (Automerge.Span, [Automerge.Span]) -> (DocNode, [(Automerge.Span, [Automerge.Span])])
 81  buildDocNode (currentSpan, remainingSpans) = case currentSpan of
 82    -- Non-embed block markers
 83    (Automerge.BlockSpan blockSpan@(AutomergeBlock marker _ False)) -> (BlockNode $ buildBlockNode marker, getChildSeeds blockSpan remainingSpans)
 84    -- Embed block markers don't have children
 85    (Automerge.BlockSpan (AutomergeBlock marker _ True)) -> (BlockNode $ buildBlockNode marker, [])
 86    -- Text spans
 87    (Automerge.TextSpan textSpan) -> (InlineNode $ convertTextSpan textSpan, [])
 88  
 89  getChildSeeds :: Automerge.BlockSpan -> [Automerge.Span] -> [(Automerge.Span, [Automerge.Span])]
 90  getChildSeeds blockSpan = (addChildlessSeed . Automerge.takeUntilNonEmbedBlockSpan) <> (getChildBlockSeeds $ Just blockSpan)
 91    where
 92      addChildlessSeed = map (\x -> (x, []))
 93  
 94  getChildBlockSeeds :: Maybe Automerge.BlockSpan -> [Automerge.Span] -> [(Automerge.Span, [Automerge.Span])]
 95  getChildBlockSeeds blockSpan = addChildBlocks
 96    where
 97      addChildBlocks [] = []
 98      addChildBlocks (x : xs) = case x of
 99        -- Note that we will enter this case even when `blockSpan` is `Nothing`.
100        -- In this case, `Automerge.isParent` will return true for the top-level blocks.
101        Automerge.BlockSpan currentSpan | Automerge.isParent blockSpan currentSpan && (not $ isEmbed currentSpan) -> createChildBlockSeed currentSpan xs : addChildBlocks xs
102        _ -> addChildBlocks xs
103        where
104          createChildBlockSeed :: BlockSpan -> [Automerge.Span] -> (Automerge.Span, [Automerge.Span])
105          -- We stop seeding when we encounter a same block type sibling so that children don't get added (replicated) to all siblings
106          createChildBlockSeed blSpan restSpans = (Automerge.BlockSpan blSpan, Automerge.takeUntilNextSameBlockTypeSibling blSpan restSpans)
107  
108  buildBlockNode :: BlockMarker -> BlockNode
109  buildBlockNode marker = case marker of
110    Automerge.ParagraphMarker -> PandocBlock $ Pandoc.Para []
111    Automerge.HeadingMarker (Heading (HeadingLevel level)) -> PandocBlock $ Pandoc.Header level nullAttr []
112    Automerge.CodeBlockMarker -> PandocBlock $ Pandoc.CodeBlock nullAttr T.empty
113    Automerge.UnorderedListItemMarker -> BulletListItem
114    Automerge.OrderedListItemMarker -> OrderedListItem
115    Automerge.BlockQuoteMarker -> PandocBlock $ Pandoc.BlockQuote []
116    Automerge.NoteRefMarker (Automerge.NoteId noteId) -> NoteRef (PandocReader.NoteId noteId)
117    Automerge.NoteContentMarker (Automerge.NoteId noteId) -> NoteContent (PandocReader.NoteId noteId)
118    _ -> undefined -- more blocks to be implemented
119  
120  convertTextSpan :: Automerge.TextSpan -> Pandoc.Inlines
121  convertTextSpan = convertMarksToInlines <*> convertTextToInlines
122  
123  convertTextToInlines :: Automerge.TextSpan -> Pandoc.Inlines
124  convertTextToInlines = Pandoc.str . value
125  
126  convertMarksToInlines :: Automerge.TextSpan -> Pandoc.Inlines -> Pandoc.Inlines
127  convertMarksToInlines textSpan inlines = foldl' (flip markToInlines) inlines $ marks textSpan
128  
129  markToInlines :: Automerge.Mark -> Pandoc.Inlines -> Pandoc.Inlines
130  markToInlines mark = case mark of
131    Automerge.Strong -> Pandoc.strong
132    Automerge.Emphasis -> Pandoc.emph
133    Automerge.LinkMark automergeLink -> Pandoc.link (url automergeLink) (title automergeLink)
134    Automerge.Code -> Pandoc.code . concatStrInlines
135      where
136        concatStrInlines :: Inlines -> T.Text
137        concatStrInlines inlines = T.concat [t | Pandoc.Str t <- Pandoc.toList inlines]
138  
139  groupListItems :: Tree DocNode -> Tree DocNode
140  groupListItems = foldTree addListNodes
141    where
142      addListNodes :: DocNode -> [Tree DocNode] -> Tree DocNode
143      addListNodes node subtrees = Node node $ case node of
144        Root -> groupAdjacentListItems subtrees
145        BlockNode _ -> groupAdjacentListItems subtrees
146        InlineNode _ -> subtrees
147  
148  groupAdjacentListItems :: [Tree DocNode] -> [Tree DocNode]
149  groupAdjacentListItems = concatMap nestListItemGroupsUnderList . groupBy isAdjacentListItemNode
150    where
151      isAdjacentListItemNode :: Tree DocNode -> Tree DocNode -> Bool
152      isAdjacentListItemNode (Node (BlockNode (BulletListItem)) _) (Node (BlockNode (BulletListItem)) _) = True
153      isAdjacentListItemNode (Node (BlockNode (OrderedListItem)) _) (Node (BlockNode (OrderedListItem)) _) = True
154      isAdjacentListItemNode _ _ = False
155  
156      nestListItemGroupsUnderList :: [Tree DocNode] -> [Tree DocNode]
157      nestListItemGroupsUnderList group = case (find listItemInGroup group) of
158        -- add bullet list node
159        Just (Node (BlockNode (BulletListItem)) _) -> [Node (BlockNode $ PandocBlock $ Pandoc.BulletList []) group]
160        -- add ordered list node
161        Just (Node (BlockNode (OrderedListItem)) _) -> [Node (BlockNode $ PandocBlock $ Pandoc.OrderedList (1, DefaultStyle, DefaultDelim) []) group]
162        _ -> group
163  
164  listItemInGroup :: Tree DocNode -> Bool
165  listItemInGroup (Node (BlockNode (BulletListItem)) _) = True
166  listItemInGroup (Node (BlockNode (OrderedListItem)) _) = True
167  listItemInGroup _ = False
168  
169  mapNotesToPandocNotes :: Tree DocNode -> Tree DocNode
170  mapNotesToPandocNotes tree = pruneNonPandocNodeNotes $ replaceNoteRefsWithPandocNotes noteContentsMap tree
171    where
172      noteContentsMap = buildNoteContentsMap tree
173  
174  type NoteContentsMap = M.Map PandocReader.NoteId (Tree DocNode)
175  
176  buildNoteContentsMap :: Tree DocNode -> NoteContentsMap
177  buildNoteContentsMap subtree@(Node node children) = case node of
178    BlockNode (NoteContent noteId) -> M.insert noteId subtree childMaps
179    _ -> childMaps
180    where
181      childMaps = M.unions (map buildNoteContentsMap children)
182  
183  replaceNoteRefsWithPandocNotes :: NoteContentsMap -> Tree DocNode -> Tree DocNode
184  replaceNoteRefsWithPandocNotes noteContentsMap = replaceNoteRefs
185    where
186      replaceNoteRefs (Node (BlockNode (NoteRef noteId)) _) =
187        case M.lookup noteId noteContentsMap of
188          Just (Node _ noteContents) ->
189            Node (InlineNode (singleton $ Pandoc.Note [])) noteContents
190          Nothing ->
191            -- Leave as an orphan ref; will be pruned later
192            Node (BlockNode (NoteRef noteId)) []
193      -- Non-ref nodes remain the same. Call `replaceNoteRefs` recursively for their children.
194      replaceNoteRefs (Node node children) = Node node $ map replaceNoteRefs children
195  
196  pruneNonPandocNodeNotes :: Tree DocNode -> Tree DocNode
197  pruneNonPandocNodeNotes (Node node children) = Node node prunedChildren
198    where
199      prunedChildren = [pruneNonPandocNodeNotes child | child <- children, not (isNoteContentNode child || isNoteRefNode child)]
200        where
201          isNoteRefNode (Node (BlockNode (NoteRef _)) _) = True
202          isNoteRefNode _ = False
203  
204          isNoteContentNode (Node (BlockNode (NoteContent _)) _) = True
205          isNoteContentNode _ = False
206  
207  treeToPandocBlocks :: Tree DocNode -> Either PandocError Pandoc.Blocks
208  treeToPandocBlocks tree = sequenceA (foldTree treeNodeToPandocBlock tree) >>= getBlockSeq
209  
210  data BlockOrInlines = BlockElement Pandoc.Block | InlineElement Pandoc.Inlines
211  
212  treeNodeToPandocBlock :: DocNode -> [[Either PandocError BlockOrInlines]] -> [Either PandocError BlockOrInlines]
213  treeNodeToPandocBlock node childrenNodes = case node of
214    Root -> concat childrenNodes
215    (BlockNode (PandocBlock (Pandoc.Para _))) -> [fmap (BlockElement . Pandoc.Para . Pandoc.toList) (concatChildrenInlines childrenNodes)]
216    (BlockNode (PandocBlock (Pandoc.Header level attr _))) -> [fmap (BlockElement . Pandoc.Header level attr . Pandoc.toList) (concatChildrenInlines childrenNodes)]
217    (BlockNode (PandocBlock (Pandoc.CodeBlock attr _))) ->
218      [ do
219          inlines <- concatChildrenInlines childrenNodes
220          case firstInline inlines of
221            Just (Str text) -> Right $ BlockElement $ Pandoc.CodeBlock attr text
222            _ -> Left $ PandocSyntaxMapError "Error in mapping: Could not extract code block text"
223      ]
224    (BlockNode (BulletListItem)) -> wrapInlinesToPlain . concatAdjacentInlines $ concat childrenNodes
225    (BlockNode (OrderedListItem)) -> wrapInlinesToPlain . concatAdjacentInlines $ concat childrenNodes
226    (BlockNode (PandocBlock (Pandoc.BulletList _))) -> [fmap (BlockElement . Pandoc.BulletList) (mapToChildBlocks childrenNodes)]
227    (BlockNode (PandocBlock (Pandoc.OrderedList attrs _))) -> [fmap (BlockElement . Pandoc.OrderedList attrs) (mapToChildBlocks childrenNodes)]
228    (BlockNode (PandocBlock (Pandoc.BlockQuote _))) -> [fmap (BlockElement . Pandoc.BlockQuote) (traverseAssertingChildIsBlock . wrapInlinesToPlain . concatAdjacentInlines $ concat childrenNodes)]
229    (BlockNode (NoteRef _)) -> [Left $ PandocSyntaxMapError "Error in mapping: found unmapped or orphan note ref node"]
230    (BlockNode (NoteContent _)) -> [Left $ PandocSyntaxMapError "Error in mapping: found unmapped or orphan note content node"]
231    (InlineNode inlines) -> case Pandoc.toList inlines of
232      -- A note can have block children, so it needs different handling compared to other inline nodes.
233      -- We assume that Inline nodes contain a note only contain a single element since we are creating them in this module (see `replaceNoteRefsWithPandocNotes`).
234      [Pandoc.Note []] ->
235        [fmap (InlineElement . Pandoc.singleton . Pandoc.Note) (traverseAssertingChildIsBlock . wrapInlinesToPlain . concatAdjacentInlines $ concat childrenNodes)]
236      -- In the generic case for inlines, just wrap them with a list of a single `InlineElement`.
237      _ ->
238        [Right $ InlineElement inlines]
239    _ -> undefined
240    where
241      concatChildrenInlines :: [[Either PandocError BlockOrInlines]] -> Either PandocError Pandoc.Inlines
242      concatChildrenInlines children = concatInlines $ map (>>= assertInlines) $ concat children
243        where
244          concatInlines :: [Either PandocError Pandoc.Inlines] -> Either PandocError Pandoc.Inlines
245          concatInlines eitherInlines = fmap mconcat $ sequenceA eitherInlines
246  
247      concatAdjacentInlines :: [Either PandocError BlockOrInlines] -> [Either PandocError BlockOrInlines]
248      concatAdjacentInlines = foldr mergeOrAppendAdjacent []
249        where
250          mergeOrAppendAdjacent :: Either PandocError BlockOrInlines -> [Either PandocError BlockOrInlines] -> [Either PandocError BlockOrInlines]
251          mergeOrAppendAdjacent x [] = [x]
252          mergeOrAppendAdjacent (Right (InlineElement currentInlines)) (Right (InlineElement firstOfRestInlines) : rest) =
253            (Right (InlineElement (currentInlines <> firstOfRestInlines)) : rest)
254          mergeOrAppendAdjacent x rest = (x : rest)
255  
256      wrapInlinesToPlain :: [Either PandocError BlockOrInlines] -> [Either PandocError BlockOrInlines]
257      wrapInlinesToPlain eitherInlines = (fmap . fmap) wrapInlines eitherInlines
258        where
259          wrapInlines :: BlockOrInlines -> BlockOrInlines
260          wrapInlines (BlockElement block) = BlockElement block
261          wrapInlines (InlineElement inlines) = BlockElement $ Pandoc.Plain $ Pandoc.toList inlines
262  
263      mapToChildBlocks :: [[Either PandocError BlockOrInlines]] -> Either PandocError [[Pandoc.Block]]
264      mapToChildBlocks children = (traverse . traverse) (>>= assertBlock) children
265  
266      traverseAssertingChildIsBlock :: [Either PandocError BlockOrInlines] -> Either PandocError [Pandoc.Block]
267      traverseAssertingChildIsBlock children = traverse (>>= assertBlock) children
268  
269      firstInline :: Pandoc.Inlines -> Maybe Pandoc.Inline
270      firstInline = firstValue
271  
272  getBlockSeq :: [BlockOrInlines] -> Either PandocError Pandoc.Blocks
273  getBlockSeq = fmap Pandoc.fromList . traverse assertBlock
274  
275  assertBlock :: BlockOrInlines -> Either PandocError Pandoc.Block
276  assertBlock (BlockElement block) = Right block
277  assertBlock (InlineElement _) = Left $ PandocSyntaxMapError "Error in mapping: found orphan inline node"
278  
279  assertInlines :: BlockOrInlines -> Either PandocError Pandoc.Inlines
280  assertInlines (BlockElement _) = Left $ PandocSyntaxMapError "Error in mapping: found block node in inline node slot"
281  assertInlines (InlineElement inlines) = Right $ inlines