PandocReader.hs
1 {-# LANGUAGE OverloadedStrings #-} 2 3 module PandocReader (toPandoc, readAutomerge) where 4 5 import Automerge (BlockMarker (..), BlockSpan (..), Heading (..), HeadingLevel (..), Link (..), Mark (..), NoteId (..), Span (..), TextSpan (..), isEmbed, isParent, parseAutomergeSpansText, takeUntilNextSameBlockTypeSibling, takeUntilNonEmbedBlockSpan) 6 import Control.Monad ((>=>)) 7 import Control.Monad.Except (throwError) 8 import Data.List (find, groupBy) 9 import Data.List.NonEmpty (NonEmpty (..), nonEmpty, toList) 10 import qualified Data.Map as M 11 import Data.Maybe (fromMaybe) 12 import qualified Data.Text as T 13 import Data.Tree (Tree (Node), drawTree, foldTree, unfoldForest) 14 import Debug.Trace 15 import Text.Pandoc (PandocError (PandocParseError, PandocSyntaxMapError), ReaderOptions) 16 import Text.Pandoc.Builder as Pandoc 17 ( Block (..), 18 Blocks, 19 Inline (Note, Str), 20 Inlines, 21 ListNumberDelim (DefaultDelim), 22 ListNumberStyle (DefaultStyle), 23 Pandoc, 24 code, 25 doc, 26 emph, 27 fromList, 28 link, 29 nullAttr, 30 singleton, 31 str, 32 strong, 33 toList, 34 ) 35 import Text.Pandoc.Class (PandocMonad) 36 import Text.Pandoc.Sources (ToSources, sourcesToText, toSources) 37 import Utils.Sequence (firstValue) 38 39 -- Although ReaderOptions are not used, the function is written like this so that it's consistent with the other Pandoc reader functions. 40 readAutomerge :: (PandocMonad m, ToSources a) => ReaderOptions -> a -> m Pandoc 41 readAutomerge _ = 42 -- Using Kleisli composition to compose the 2 smaller functions in the monadic context (PandocMonad) 43 parseSpansAndHandleErrors >=> toPandoc 44 where 45 -- Here we parse the JSON Automerge spans but also convert a potential error to a Pandoc parsing error 46 -- The result is a list of Automerge spans wrapped within a Pandoc monad. 47 parseSpansAndHandleErrors :: (ToSources a, PandocMonad m) => a -> m [Automerge.Span] 48 parseSpansAndHandleErrors = either handleParsingErrorMessage pure . parseSpans 49 50 parseSpans :: (ToSources a) => a -> Either String [Span] 51 parseSpans = parseAutomergeSpansText . sourcesToText . toSources 52 53 handleParsingErrorMessage :: (PandocMonad m) => String -> m a 54 handleParsingErrorMessage = throwError . PandocParseError . T.pack 55 56 toPandoc :: (PandocMonad m) => [Automerge.Span] -> m Pandoc.Pandoc 57 toPandoc = (either throwError (pure . Pandoc.doc)) . convertSpansToBlocks 58 where 59 convertSpansToBlocks :: [Automerge.Span] -> Either PandocError Pandoc.Blocks 60 convertSpansToBlocks = fromMaybe (Right $ Pandoc.fromList []) . fmap treeToPandocBlocks . buildTree 61 62 newtype NoteId = NoteId T.Text deriving (Show, Eq, Ord) 63 64 data BlockNode = PandocBlock Pandoc.Block | BulletListItem | OrderedListItem | NoteRef PandocReader.NoteId | NoteContent PandocReader.NoteId deriving (Show) 65 66 data DocNode = Root | BlockNode BlockNode | InlineNode Pandoc.Inlines deriving (Show) 67 68 traceTree :: Tree DocNode -> Tree DocNode 69 traceTree tree = Debug.Trace.trace (drawTree $ fmap show tree) tree 70 71 buildTree :: [Automerge.Span] -> Maybe (Tree DocNode) 72 buildTree = (fmap (traceTree . mapNotesToPandocNotes . groupListItems . buildRawTree)) . nonEmpty 73 74 buildRawTree :: NonEmpty Automerge.Span -> Tree DocNode 75 buildRawTree spans = Node Root $ unfoldForest buildDocNode $ getTopLevelBlockSeeds spansList 76 where 77 spansList = Data.List.NonEmpty.toList spans 78 getTopLevelBlockSeeds = getChildBlockSeeds Nothing 79 80 buildDocNode :: (Automerge.Span, [Automerge.Span]) -> (DocNode, [(Automerge.Span, [Automerge.Span])]) 81 buildDocNode (currentSpan, remainingSpans) = case currentSpan of 82 -- Non-embed block markers 83 (Automerge.BlockSpan blockSpan@(AutomergeBlock marker _ False)) -> (BlockNode $ buildBlockNode marker, getChildSeeds blockSpan remainingSpans) 84 -- Embed block markers don't have children 85 (Automerge.BlockSpan (AutomergeBlock marker _ True)) -> (BlockNode $ buildBlockNode marker, []) 86 -- Text spans 87 (Automerge.TextSpan textSpan) -> (InlineNode $ convertTextSpan textSpan, []) 88 89 getChildSeeds :: Automerge.BlockSpan -> [Automerge.Span] -> [(Automerge.Span, [Automerge.Span])] 90 getChildSeeds blockSpan = (addChildlessSeed . Automerge.takeUntilNonEmbedBlockSpan) <> (getChildBlockSeeds $ Just blockSpan) 91 where 92 addChildlessSeed = map (\x -> (x, [])) 93 94 getChildBlockSeeds :: Maybe Automerge.BlockSpan -> [Automerge.Span] -> [(Automerge.Span, [Automerge.Span])] 95 getChildBlockSeeds blockSpan = addChildBlocks 96 where 97 addChildBlocks [] = [] 98 addChildBlocks (x : xs) = case x of 99 -- Note that we will enter this case even when `blockSpan` is `Nothing`. 100 -- In this case, `Automerge.isParent` will return true for the top-level blocks. 101 Automerge.BlockSpan currentSpan | Automerge.isParent blockSpan currentSpan && (not $ isEmbed currentSpan) -> createChildBlockSeed currentSpan xs : addChildBlocks xs 102 _ -> addChildBlocks xs 103 where 104 createChildBlockSeed :: BlockSpan -> [Automerge.Span] -> (Automerge.Span, [Automerge.Span]) 105 -- We stop seeding when we encounter a same block type sibling so that children don't get added (replicated) to all siblings 106 createChildBlockSeed blSpan restSpans = (Automerge.BlockSpan blSpan, Automerge.takeUntilNextSameBlockTypeSibling blSpan restSpans) 107 108 buildBlockNode :: BlockMarker -> BlockNode 109 buildBlockNode marker = case marker of 110 Automerge.ParagraphMarker -> PandocBlock $ Pandoc.Para [] 111 Automerge.HeadingMarker (Heading (HeadingLevel level)) -> PandocBlock $ Pandoc.Header level nullAttr [] 112 Automerge.CodeBlockMarker -> PandocBlock $ Pandoc.CodeBlock nullAttr T.empty 113 Automerge.UnorderedListItemMarker -> BulletListItem 114 Automerge.OrderedListItemMarker -> OrderedListItem 115 Automerge.BlockQuoteMarker -> PandocBlock $ Pandoc.BlockQuote [] 116 Automerge.NoteRefMarker (Automerge.NoteId noteId) -> NoteRef (PandocReader.NoteId noteId) 117 Automerge.NoteContentMarker (Automerge.NoteId noteId) -> NoteContent (PandocReader.NoteId noteId) 118 _ -> undefined -- more blocks to be implemented 119 120 convertTextSpan :: Automerge.TextSpan -> Pandoc.Inlines 121 convertTextSpan = convertMarksToInlines <*> convertTextToInlines 122 123 convertTextToInlines :: Automerge.TextSpan -> Pandoc.Inlines 124 convertTextToInlines = Pandoc.str . value 125 126 convertMarksToInlines :: Automerge.TextSpan -> Pandoc.Inlines -> Pandoc.Inlines 127 convertMarksToInlines textSpan inlines = foldl' (flip markToInlines) inlines $ marks textSpan 128 129 markToInlines :: Automerge.Mark -> Pandoc.Inlines -> Pandoc.Inlines 130 markToInlines mark = case mark of 131 Automerge.Strong -> Pandoc.strong 132 Automerge.Emphasis -> Pandoc.emph 133 Automerge.LinkMark automergeLink -> Pandoc.link (url automergeLink) (title automergeLink) 134 Automerge.Code -> Pandoc.code . concatStrInlines 135 where 136 concatStrInlines :: Inlines -> T.Text 137 concatStrInlines inlines = T.concat [t | Pandoc.Str t <- Pandoc.toList inlines] 138 139 groupListItems :: Tree DocNode -> Tree DocNode 140 groupListItems = foldTree addListNodes 141 where 142 addListNodes :: DocNode -> [Tree DocNode] -> Tree DocNode 143 addListNodes node subtrees = Node node $ case node of 144 Root -> groupAdjacentListItems subtrees 145 BlockNode _ -> groupAdjacentListItems subtrees 146 InlineNode _ -> subtrees 147 148 groupAdjacentListItems :: [Tree DocNode] -> [Tree DocNode] 149 groupAdjacentListItems = concatMap nestListItemGroupsUnderList . groupBy isAdjacentListItemNode 150 where 151 isAdjacentListItemNode :: Tree DocNode -> Tree DocNode -> Bool 152 isAdjacentListItemNode (Node (BlockNode (BulletListItem)) _) (Node (BlockNode (BulletListItem)) _) = True 153 isAdjacentListItemNode (Node (BlockNode (OrderedListItem)) _) (Node (BlockNode (OrderedListItem)) _) = True 154 isAdjacentListItemNode _ _ = False 155 156 nestListItemGroupsUnderList :: [Tree DocNode] -> [Tree DocNode] 157 nestListItemGroupsUnderList group = case (find listItemInGroup group) of 158 -- add bullet list node 159 Just (Node (BlockNode (BulletListItem)) _) -> [Node (BlockNode $ PandocBlock $ Pandoc.BulletList []) group] 160 -- add ordered list node 161 Just (Node (BlockNode (OrderedListItem)) _) -> [Node (BlockNode $ PandocBlock $ Pandoc.OrderedList (1, DefaultStyle, DefaultDelim) []) group] 162 _ -> group 163 164 listItemInGroup :: Tree DocNode -> Bool 165 listItemInGroup (Node (BlockNode (BulletListItem)) _) = True 166 listItemInGroup (Node (BlockNode (OrderedListItem)) _) = True 167 listItemInGroup _ = False 168 169 mapNotesToPandocNotes :: Tree DocNode -> Tree DocNode 170 mapNotesToPandocNotes tree = pruneNonPandocNodeNotes $ replaceNoteRefsWithPandocNotes noteContentsMap tree 171 where 172 noteContentsMap = buildNoteContentsMap tree 173 174 type NoteContentsMap = M.Map PandocReader.NoteId (Tree DocNode) 175 176 buildNoteContentsMap :: Tree DocNode -> NoteContentsMap 177 buildNoteContentsMap subtree@(Node node children) = case node of 178 BlockNode (NoteContent noteId) -> M.insert noteId subtree childMaps 179 _ -> childMaps 180 where 181 childMaps = M.unions (map buildNoteContentsMap children) 182 183 replaceNoteRefsWithPandocNotes :: NoteContentsMap -> Tree DocNode -> Tree DocNode 184 replaceNoteRefsWithPandocNotes noteContentsMap = replaceNoteRefs 185 where 186 replaceNoteRefs (Node (BlockNode (NoteRef noteId)) _) = 187 case M.lookup noteId noteContentsMap of 188 Just (Node _ noteContents) -> 189 Node (InlineNode (singleton $ Pandoc.Note [])) noteContents 190 Nothing -> 191 -- Leave as an orphan ref; will be pruned later 192 Node (BlockNode (NoteRef noteId)) [] 193 -- Non-ref nodes remain the same. Call `replaceNoteRefs` recursively for their children. 194 replaceNoteRefs (Node node children) = Node node $ map replaceNoteRefs children 195 196 pruneNonPandocNodeNotes :: Tree DocNode -> Tree DocNode 197 pruneNonPandocNodeNotes (Node node children) = Node node prunedChildren 198 where 199 prunedChildren = [pruneNonPandocNodeNotes child | child <- children, not (isNoteContentNode child || isNoteRefNode child)] 200 where 201 isNoteRefNode (Node (BlockNode (NoteRef _)) _) = True 202 isNoteRefNode _ = False 203 204 isNoteContentNode (Node (BlockNode (NoteContent _)) _) = True 205 isNoteContentNode _ = False 206 207 treeToPandocBlocks :: Tree DocNode -> Either PandocError Pandoc.Blocks 208 treeToPandocBlocks tree = sequenceA (foldTree treeNodeToPandocBlock tree) >>= getBlockSeq 209 210 data BlockOrInlines = BlockElement Pandoc.Block | InlineElement Pandoc.Inlines 211 212 treeNodeToPandocBlock :: DocNode -> [[Either PandocError BlockOrInlines]] -> [Either PandocError BlockOrInlines] 213 treeNodeToPandocBlock node childrenNodes = case node of 214 Root -> concat childrenNodes 215 (BlockNode (PandocBlock (Pandoc.Para _))) -> [fmap (BlockElement . Pandoc.Para . Pandoc.toList) (concatChildrenInlines childrenNodes)] 216 (BlockNode (PandocBlock (Pandoc.Header level attr _))) -> [fmap (BlockElement . Pandoc.Header level attr . Pandoc.toList) (concatChildrenInlines childrenNodes)] 217 (BlockNode (PandocBlock (Pandoc.CodeBlock attr _))) -> 218 [ do 219 inlines <- concatChildrenInlines childrenNodes 220 case firstInline inlines of 221 Just (Str text) -> Right $ BlockElement $ Pandoc.CodeBlock attr text 222 _ -> Left $ PandocSyntaxMapError "Error in mapping: Could not extract code block text" 223 ] 224 (BlockNode (BulletListItem)) -> wrapInlinesToPlain . concatAdjacentInlines $ concat childrenNodes 225 (BlockNode (OrderedListItem)) -> wrapInlinesToPlain . concatAdjacentInlines $ concat childrenNodes 226 (BlockNode (PandocBlock (Pandoc.BulletList _))) -> [fmap (BlockElement . Pandoc.BulletList) (mapToChildBlocks childrenNodes)] 227 (BlockNode (PandocBlock (Pandoc.OrderedList attrs _))) -> [fmap (BlockElement . Pandoc.OrderedList attrs) (mapToChildBlocks childrenNodes)] 228 (BlockNode (PandocBlock (Pandoc.BlockQuote _))) -> [fmap (BlockElement . Pandoc.BlockQuote) (traverseAssertingChildIsBlock . wrapInlinesToPlain . concatAdjacentInlines $ concat childrenNodes)] 229 (BlockNode (NoteRef _)) -> [Left $ PandocSyntaxMapError "Error in mapping: found unmapped or orphan note ref node"] 230 (BlockNode (NoteContent _)) -> [Left $ PandocSyntaxMapError "Error in mapping: found unmapped or orphan note content node"] 231 (InlineNode inlines) -> case Pandoc.toList inlines of 232 -- A note can have block children, so it needs different handling compared to other inline nodes. 233 -- We assume that Inline nodes contain a note only contain a single element since we are creating them in this module (see `replaceNoteRefsWithPandocNotes`). 234 [Pandoc.Note []] -> 235 [fmap (InlineElement . Pandoc.singleton . Pandoc.Note) (traverseAssertingChildIsBlock . wrapInlinesToPlain . concatAdjacentInlines $ concat childrenNodes)] 236 -- In the generic case for inlines, just wrap them with a list of a single `InlineElement`. 237 _ -> 238 [Right $ InlineElement inlines] 239 _ -> undefined 240 where 241 concatChildrenInlines :: [[Either PandocError BlockOrInlines]] -> Either PandocError Pandoc.Inlines 242 concatChildrenInlines children = concatInlines $ map (>>= assertInlines) $ concat children 243 where 244 concatInlines :: [Either PandocError Pandoc.Inlines] -> Either PandocError Pandoc.Inlines 245 concatInlines eitherInlines = fmap mconcat $ sequenceA eitherInlines 246 247 concatAdjacentInlines :: [Either PandocError BlockOrInlines] -> [Either PandocError BlockOrInlines] 248 concatAdjacentInlines = foldr mergeOrAppendAdjacent [] 249 where 250 mergeOrAppendAdjacent :: Either PandocError BlockOrInlines -> [Either PandocError BlockOrInlines] -> [Either PandocError BlockOrInlines] 251 mergeOrAppendAdjacent x [] = [x] 252 mergeOrAppendAdjacent (Right (InlineElement currentInlines)) (Right (InlineElement firstOfRestInlines) : rest) = 253 (Right (InlineElement (currentInlines <> firstOfRestInlines)) : rest) 254 mergeOrAppendAdjacent x rest = (x : rest) 255 256 wrapInlinesToPlain :: [Either PandocError BlockOrInlines] -> [Either PandocError BlockOrInlines] 257 wrapInlinesToPlain eitherInlines = (fmap . fmap) wrapInlines eitherInlines 258 where 259 wrapInlines :: BlockOrInlines -> BlockOrInlines 260 wrapInlines (BlockElement block) = BlockElement block 261 wrapInlines (InlineElement inlines) = BlockElement $ Pandoc.Plain $ Pandoc.toList inlines 262 263 mapToChildBlocks :: [[Either PandocError BlockOrInlines]] -> Either PandocError [[Pandoc.Block]] 264 mapToChildBlocks children = (traverse . traverse) (>>= assertBlock) children 265 266 traverseAssertingChildIsBlock :: [Either PandocError BlockOrInlines] -> Either PandocError [Pandoc.Block] 267 traverseAssertingChildIsBlock children = traverse (>>= assertBlock) children 268 269 firstInline :: Pandoc.Inlines -> Maybe Pandoc.Inline 270 firstInline = firstValue 271 272 getBlockSeq :: [BlockOrInlines] -> Either PandocError Pandoc.Blocks 273 getBlockSeq = fmap Pandoc.fromList . traverse assertBlock 274 275 assertBlock :: BlockOrInlines -> Either PandocError Pandoc.Block 276 assertBlock (BlockElement block) = Right block 277 assertBlock (InlineElement _) = Left $ PandocSyntaxMapError "Error in mapping: found orphan inline node" 278 279 assertInlines :: BlockOrInlines -> Either PandocError Pandoc.Inlines 280 assertInlines (BlockElement _) = Left $ PandocSyntaxMapError "Error in mapping: found block node in inline node slot" 281 assertInlines (InlineElement inlines) = Right $ inlines