/ bin / gargantext-cli / CleanCsvCorpus.hs
CleanCsvCorpus.hs
 1  {-|
 2  Module      : CleanCsvCorpus.hs
 3  Description : Gargantext starter
 4  Copyright   : (c) CNRS, 2017-Present
 5  License     : AGPL + CECILL v3
 6  Maintainer  : team@gargantext.org
 7  Stability   : experimental
 8  Portability : POSIX
 9  
10  Given a Gargantext CSV File and its Query This script cleans and
11  compress the contexts around the main terms of the query.
12  -}
13  
14  
15  module CleanCsvCorpus  where
16  
17  import Data.SearchEngine qualified as S
18  import Data.Set qualified as S
19  import Data.Text (pack)
20  import Data.Vector (Vector)
21  import Data.Vector qualified as V
22  import Gargantext.Core.Text.Corpus.Parsers.CSV qualified as CSV
23  import Gargantext.Core.Text.Search
24  import Gargantext.Prelude
25  ------------------------------------------------------------------------
26  
27  type Query = [S.Term]
28  
29  filterDocs :: [DocId] -> Vector CSV.CsvGargV3 -> Vector CSV.CsvGargV3
30  filterDocs docIds = V.filter (\doc -> S.member (CSV.d_docId doc) $ S.fromList docIds )
31  
32  
33  main :: IO ()
34  main = do
35    let rPath = "/tmp/Gargantext_Corpus.csv"
36    let wPath = "/tmp/Gargantext_Corpus_bis.csv"
37    --let q = ["water", "scarcity", "morocco", "shortage","flood"]
38    let q = ["gratuit", "gratuité", "culture", "culturel"]
39  
40    eDocs <- CSV.readCSVFile rPath
41    case eDocs of
42      Right (h, csvDocs) -> do
43        putStrLn ("Number of documents before:" <> show (V.length csvDocs) :: Text)
44        putStrLn ("Mean size of docs:" <> show ( CSV.docsSize csvDocs) :: Text)
45  
46        let docs   = CSV.toDocs csvDocs
47        let engine = S.insertDocs docs initialDocSearchEngine
48        let docIds = S.query engine (map pack q)
49        let docs'  = CSV.fromDocs $ filterDocs docIds (V.fromList docs)
50  
51        putStrLn ("Number of documents after:" <> show (V.length docs') :: Text)
52        putStrLn ("Mean size of docs:" <> show (CSV.docsSize docs') :: Text)
53  
54        CSV.writeFile wPath (h, docs')
55      Left e -> panicTrace $ "Error: " <> e