CleanCsvCorpus.hs
1 {-| 2 Module : CleanCsvCorpus.hs 3 Description : Gargantext starter 4 Copyright : (c) CNRS, 2017-Present 5 License : AGPL + CECILL v3 6 Maintainer : team@gargantext.org 7 Stability : experimental 8 Portability : POSIX 9 10 Given a Gargantext CSV File and its Query This script cleans and 11 compress the contexts around the main terms of the query. 12 -} 13 14 15 module CleanCsvCorpus where 16 17 import Data.SearchEngine qualified as S 18 import Data.Set qualified as S 19 import Data.Text (pack) 20 import Data.Vector (Vector) 21 import Data.Vector qualified as V 22 import Gargantext.Core.Text.Corpus.Parsers.CSV qualified as CSV 23 import Gargantext.Core.Text.Search 24 import Gargantext.Prelude 25 ------------------------------------------------------------------------ 26 27 type Query = [S.Term] 28 29 filterDocs :: [DocId] -> Vector CSV.CsvGargV3 -> Vector CSV.CsvGargV3 30 filterDocs docIds = V.filter (\doc -> S.member (CSV.d_docId doc) $ S.fromList docIds ) 31 32 33 main :: IO () 34 main = do 35 let rPath = "/tmp/Gargantext_Corpus.csv" 36 let wPath = "/tmp/Gargantext_Corpus_bis.csv" 37 --let q = ["water", "scarcity", "morocco", "shortage","flood"] 38 let q = ["gratuit", "gratuité", "culture", "culturel"] 39 40 eDocs <- CSV.readCSVFile rPath 41 case eDocs of 42 Right (h, csvDocs) -> do 43 putStrLn ("Number of documents before:" <> show (V.length csvDocs) :: Text) 44 putStrLn ("Mean size of docs:" <> show ( CSV.docsSize csvDocs) :: Text) 45 46 let docs = CSV.toDocs csvDocs 47 let engine = S.insertDocs docs initialDocSearchEngine 48 let docIds = S.query engine (map pack q) 49 let docs' = CSV.fromDocs $ filterDocs docIds (V.fromList docs) 50 51 putStrLn ("Number of documents after:" <> show (V.length docs') :: Text) 52 putStrLn ("Mean size of docs:" <> show (CSV.docsSize docs') :: Text) 53 54 CSV.writeFile wPath (h, docs') 55 Left e -> panicTrace $ "Error: " <> e