# data_file_handler.py
1 # Python Imports 2 import logging 3 from pathlib import Path 4 from typing import List, Optional 5 6 import pandas as pd 7 from result import Err, Ok, Result 8 9 # Project Imports 10 from src.analysis.data.data_handler import DataHandler 11 from src.analysis.utils import file_utils 12 13 logger = logging.getLogger(__name__) 14 15 16 class DataFileHandler(DataHandler): 17 18 def __init__(self, ignore_columns: Optional[List] = None, include_files: Optional[List] = None): 19 super().__init__(ignore_columns) 20 self._include_files = include_files 21 22 def concat_dataframes_from_folders_as_mean(self, folders: List, points: int): 23 for folder in folders: 24 folder_path = Path(folder) 25 folder_df = pd.DataFrame() 26 match file_utils.get_files_from_folder_path(folder_path, self._include_files): 27 case Ok(data_files_names): 28 folder_df = self._concat_files_as_mean( 29 folder_df, data_files_names, folder_path, points 30 ) 31 folder_df["class"] = f"{folder_path.parent.name}/{folder_path.name}" 32 self._dataframe = pd.concat([self._dataframe, folder_df]) 33 case Err(error): 34 logger.error(error) 35 36 def _concat_files_as_mean( 37 self, target_df: pd.DataFrame, data_files_path: List, location: Path, points: int 38 ) -> pd.DataFrame: 39 for file_path in data_files_path: 40 match self._concat_data_as_mean_from_file(target_df, location / file_path, points): 41 case Ok(result_df): 42 logger.info(f"{file_path} added") 43 target_df = result_df 44 case Err(msg): 45 logger.error(msg) 46 47 return target_df 48 49 def _concat_data_as_mean_from_file( 50 self, target_df: pd.DataFrame, file_path: Path, points: int 51 ) -> Result[pd.DataFrame, str]: 52 if not file_path.exists(): 53 return Err(f"{file_path} cannot be dumped to memory.") 54 55 logger.info(f"Reading {file_path} with {points} datapoints") 56 file_df = pd.read_csv(file_path, parse_dates=["Time"], index_col="Time", nrows=points) 57 if len(file_df) < points: 58 logger.warning(f"Not enough datapoints in {file_path}") 59 60 
target_df = self.concat_data_as_mean(target_df, file_df, file_path.name) 61 62 return Ok(target_df)