/ src / Gargantext / Core / Text.hs
Text.hs
  1  {-|
  2  Module      : Gargantext.Core.Text
  3  Description : Ngrams tools
  4  Copyright   : (c) CNRS, 2018
  5  License     : AGPL + CECILL v3
  6  Maintainer  : team@gargantext.org
  7  Stability   : experimental
  8  Portability : POSIX
  9  
Text gathers terms into units of context.
 11  
 12  -}
 13  
 14  
 15  module Gargantext.Core.Text
 16    where
 17  
 18  import Data.Text (split)
 19  import Data.Text qualified as DT
 20  import Gargantext.Prelude hiding (filter)
 21  import NLP.FullStop (segment)
 22  import Prelude qualified
 23  
 24  -----------------------------------------------------------------
 25  class HasText h
 26    where
 27      hasText :: h -> [Text]
 28  
 29  -----------------------------------------------------------------
 30  -- French words to distinguish contexts
 31  newtype Texte      = Texte      Text
 32  newtype Paragraphe = Paragraphe Text
 33  newtype Phrase     = Phrase     Text
 34  newtype MultiTerme = MultiTerme Text
 35  newtype Mot        = Mot        Text
 36  newtype Lettre     = Lettre     Text
 37  
 38  -- | Type syn seems obvious
 39  type    Titre      = Phrase
 40  
 41  -----------------------------------------------------------------
 42  
 43  instance Prelude.Show Texte where
 44    show (Texte t) = show t
 45  
 46  instance Prelude.Show Paragraphe where
 47    show (Paragraphe p) = show p
 48  
 49  instance Prelude.Show Phrase where
 50    show (Phrase p) = show p
 51  
 52  instance Prelude.Show MultiTerme where
 53    show (MultiTerme mt) = show mt
 54  
 55  instance Prelude.Show Mot where
 56    show (Mot t) = show t
 57  
 58  instance Prelude.Show Lettre where
 59    show (Lettre l) = show l
 60  
 61  -----------------------------------------------------------------
 62  
 63  class Collage sup inf where
 64    dec ::  sup  -> [inf]
 65    inc :: [inf] -> sup
 66  
 67  instance Collage Texte Paragraphe where
 68    dec (Texte t) = map Paragraphe $ DT.splitOn "\n" t
 69    inc           = Texte . DT.unlines . map (\(Paragraphe t) -> t)
 70  
 71  instance Collage Paragraphe Phrase where
 72    dec (Paragraphe t) = map Phrase $ sentences t
 73    inc                = Paragraphe . DT.unwords . map (\(Phrase p) -> p)
 74  
 75  instance Collage Phrase MultiTerme where
 76    dec (Phrase t) = map MultiTerme $ DT.words t
 77    inc            = Phrase . DT.unwords . map (\(MultiTerme p) -> p)
 78  
 79  instance Collage MultiTerme Mot where
 80    dec (MultiTerme mt) = map Mot $ DT.words mt
 81    inc                 = MultiTerme . DT.unwords . map (\(Mot m) -> m)
 82  
 83  -------------------------------------------------------------------
 84  -- Contexts of text
 85  sentences :: Text -> [Text]
 86  sentences txt = map DT.pack $ segment $ DT.unpack txt
 87  
 88  sentences' :: Text -> [Text]
 89  sentences' txt = split isCharStop txt
 90  
 91  isCharStop :: Char -> Bool
 92  isCharStop c = c `elem` ['.','?','!']
 93  
 94  unsentences :: [Text] -> Text
 95  unsentences txts = DT.unwords txts
 96  
 97  -- | Ngrams size
 98  size :: Text -> Int
 99  size t = 1 + DT.count " " t
100  
101  
102