-- Text.hs
{-|
Module      : Gargantext.Core.Text
Description : Ngrams tools
Copyright   : (c) CNRS, 2018
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Text gathers terms in units of context.

-}


module Gargantext.Core.Text
  where

import Data.Text (split)
import Data.Text qualified as DT
import Gargantext.Prelude hiding (filter)
import NLP.FullStop (segment)
import Prelude qualified

-----------------------------------------------------------------
-- | Things from which a list of 'Text' fragments can be extracted.
class HasText h
  where
    hasText :: h -> [Text]

-----------------------------------------------------------------
-- French words to distinguish contexts.
-- Each newtype wraps a plain 'Text' and only records which level of
-- context the value stands for.

-- | A whole text.
newtype Texte      = Texte      Text
-- | A paragraph of a 'Texte'.
newtype Paragraphe = Paragraphe Text
-- | A sentence of a 'Paragraphe'.
newtype Phrase     = Phrase     Text
-- | A multi-term of a 'Phrase'.
newtype MultiTerme = MultiTerme Text
-- | A word of a 'MultiTerme'.
newtype Mot        = Mot        Text
-- | A letter.
newtype Lettre     = Lettre     Text

-- | A title is represented as a single 'Phrase'.
type Titre = Phrase

-----------------------------------------------------------------
-- The 'Prelude.Show' instances below render the wrapped 'Text' only;
-- the constructor name is not part of the output.

instance Prelude.Show Texte where
  show (Texte t) = show t

instance Prelude.Show Paragraphe where
  show (Paragraphe p) = show p

instance Prelude.Show Phrase where
  show (Phrase p) = show p

instance Prelude.Show MultiTerme where
  show (MultiTerme mt) = show mt

instance Prelude.Show Mot where
  show (Mot t) = show t

instance Prelude.Show Lettre where
  show (Lettre l) = show l

-----------------------------------------------------------------

-- | Relates a larger context @sup@ to the smaller contexts @inf@ it is
-- made of.
class Collage sup inf where
  -- | Decompose a context into its sub-contexts.
  dec :: sup -> [inf]
  -- | Rebuild a context from its sub-contexts.
  inc :: [inf] -> sup

-- | A 'Texte' is cut into paragraphs on every @"\\n"@.
instance Collage Texte Paragraphe where
  dec (Texte t) = map Paragraphe $ DT.splitOn "\n" t
  -- Joined with "\n" and no trailing newline: 'DT.unlines' would append
  -- a final "\n", which 'dec' would then turn into a spurious empty
  -- trailing 'Paragraphe'. 'DT.intercalate' is the inverse of
  -- 'DT.splitOn' for a non-empty list.
  inc           = Texte . DT.intercalate "\n" . map (\(Paragraphe t) -> t)

-- | A 'Paragraphe' is cut into sentences with 'sentences' and rebuilt by
-- joining the sentences with single spaces.
instance Collage Paragraphe Phrase where
  dec (Paragraphe t) = map Phrase $ sentences t
  inc                = Paragraphe . DT.unwords . map (\(Phrase p) -> p)

-- | A 'Phrase' is cut on whitespace ('DT.words'), so every resulting
-- 'MultiTerme' holds exactly one whitespace-delimited token.
instance Collage Phrase MultiTerme where
  dec (Phrase t) = map MultiTerme $ DT.words t
  inc            = Phrase . DT.unwords . map (\(MultiTerme p) -> p)

-- | A 'MultiTerme' is cut on whitespace ('DT.words') into its words.
instance Collage MultiTerme Mot where
  dec (MultiTerme mt) = map Mot $ DT.words mt
  inc                 = MultiTerme . DT.unwords . map (\(Mot m) -> m)

-------------------------------------------------------------------
-- Contexts of text

-- | Split a text into sentences by running 'segment' from "NLP.FullStop"
-- on the unpacked text and packing each result back into 'Text'.
sentences :: Text -> [Text]
sentences txt = map DT.pack $ segment $ DT.unpack txt

-- | Alternative splitter: cut the text at every character accepted by
-- 'isCharStop'. The stop characters themselves are dropped, and
-- adjacent or trailing stop characters yield empty fragments.
sentences' :: Text -> [Text]
sentences' txt = split isCharStop txt

-- | 'True' for the sentence-ending characters @.@, @?@ and @!@.
isCharStop :: Char -> Bool
isCharStop c = c `elem` ['.','?','!']

-- | Join sentences with single spaces.
unsentences :: [Text] -> Text
unsentences txts = DT.unwords txts

-- | Ngrams size: one more than the number of @" "@ occurrences in the
-- text. Consequently the empty text has size 1 and every consecutive
-- space is counted.
size :: Text -> Int
size t = 1 + DT.count " " t