Source code for LOGS_solutions.GenerateStatistics.StatisticEntities.StatisticsDatasets

import logging
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List

from LOGS.Entities import (
    Dataset,
    DatasetRequestParameter,
    InventoryItem,
    InventoryItemRequestParameter,
    CustomFieldDataType,
)
from LOGS.LOGS import LOGS

from .StatisticHandlerEntities import StatisticHandlerEntities



[docs]
class StatisticsDatasets(StatisticHandlerEntities):
    """
    Class for creating the statistics for the data sets.
    Includes the following statistics:
    How many data sets were created per time unit (day, week, month, year). The creation date is used.
    The statistics are output per
    - LOGS group
    - Person (or filtered by a specific person)
    - Inventory (or filtered by a specific inventory item)

    The result is a CSV file per person, logs-group and inventory and a pdf per logs-group, person and inventory.
    """

    def __init__(
        self,
        logs: LOGS,
        inventories: List,
        target_path: str = "./statistics",
        begin_date: datetime = None,
        end_date: datetime = None,
        show_num: bool = True,
        persons: List = None,
        inventory_items: List = None,
    ):
        """Initialization.

        :param logs: LOGS object to access the LOGS web API,
        :param inventories: List of inventories to be included in the statistics. Have to be specified by their IDs.
        :param target_path: The target path, where all statistics should be saved.
        Default: Within the folder containing the script, a new folder "statistics"
        is created in which all statistics are saved.
        :param begin_date: Lowest date limit for statistics to be created.
        Default: None -> midnight of the creation date of the first data set
        returned when sorting by creation date.
        :param end_date: Highest date limit for statistics to be created.
        Default: None -> midnight after the creation date of the last data set
        returned when sorting by creation date.
        :param show_num: Boolean to show the number of data sets in the heatmap.
        Non-boolean values are replaced by True.
        Default: True
        :param persons: List of persons to be included in the statistics.
        Default: None or empty list -> all persons are included.
        :param inventory_items: List of inventory items to be included in the statistics. Have to be specified by their IDs.
        Default: None or empty list -> all inventory items are included.
        """

        self._logger_datasets = logging.getLogger("StatisticDatasets")
        self._logger_datasets.setLevel(logging.INFO)

        # NOTE(review): the folder is created next to this module, but no file
        # handler is attached to it in this method - confirm it is still needed.
        logfile_folder = Path(__file__).resolve().parent / "logfiles"
        logfile_folder.mkdir(parents=True, exist_ok=True)

        # Attach the console handler only once; the named logger is shared by
        # every instance of this class.
        if not self._logger_datasets.hasHandlers():
            formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")

            logconsole_handler = logging.StreamHandler(sys.stdout)
            logconsole_handler.setLevel(logging.INFO)
            logconsole_handler.setFormatter(formatter)
            self._logger_datasets.addHandler(logconsole_handler)

        super().__init__(logs, begin_date, end_date, target_path, self._logger_datasets)
        self.__dataset_path = self._target_path / "dataset"
        self.__show_num = show_num if isinstance(show_num, bool) else True

        # A fresh list per call replaces the former mutable default arguments,
        # which were shared between all instances.
        self._persons = self._validate_list([] if persons is None else persons)
        self._inventories = self._validate_list(inventories)
        self._inventory_items = self._validate_list(
            [] if inventory_items is None else inventory_items
        )

        if self._begin_date is None:
            first_dataset = self._logs.datasets(
                DatasetRequestParameter(sortBy="CREATION_DATE")
            ).first()
            # Fall back to the current time when no data set (or no creation
            # date) is available, mirroring the end-date fallback below.
            first_created = getattr(first_dataset, "creationDate", None)
            if first_created is None:
                first_created = datetime.now()
            self._begin_date = first_created.replace(
                hour=0, minute=0, second=0, microsecond=0
            )

        if self._end_date is None:
            # Walk the iterator and keep only the last element instead of
            # materializing every data set in a list.
            last_dataset = None
            for last_dataset in self._logs.datasets(
                DatasetRequestParameter(sortBy="CREATION_DATE")
            ):
                pass
            last_created = getattr(last_dataset, "creationDate", None)
            if last_created is None:
                last_created = datetime.now()
            # Midnight of the following day, so the whole last day is covered.
            self._end_date = (last_created + timedelta(days=1)).replace(
                hour=0, minute=0, second=0, microsecond=0
            )

        # Cache of fetched inventory items, filled by update_inventory_dict.
        self._custom_fields = {}
        # Names of the inventories encountered while processing data sets.
        self._inventory_types = []
        self._logger_datasets.info(
            "Statistics time frame: begin date: %s - end date: %s.",
            self._begin_date,
            self._end_date,
        )


[docs]
    def update_inventory_dict(
        self, dataset: Dataset, dataset_inventory_dict: Dict
    ) -> Dict:
        """Updates the dictionary of inventories based on the provided dataset.
        The dictionary is updated with the inventory associated with the data
        set. The creation date is added to the associated list in the
        dictionary.

        :param dataset: The dataset from which inventory details and creation dates are extracted.
        :param dataset_inventory_dict: A dictionary mapping inventory IDs to lists that contain the
                                        inventory's name and previously recorded creation dates.
                                        Structure: {inventory_id: [inventory_name, ...creation_dates]}

        :return: An updated dictionary where each key is an inventory ID and each value is a list with the
                inventory name as the first element followed by all creation dates (both existing and newly added).
                Structure: {inventory_id: [inventory_name, creation_date1, creation_date2, ...]}
        """

        if dataset.customValues is None:
            if dataset_inventory_dict.get(0) is None:
                dataset_inventory_dict[0] = [
                    f"No Inventory Item of Inventory with ID {self._inventories}"
                ]
            dataset_inventory_dict[0].append(dataset.creationDate)
            return dataset_inventory_dict

        if dataset.customValues.customFields is None:
            if dataset_inventory_dict.get(0) is None:
                dataset_inventory_dict[0] = [
                    f"No Inventory Item of Inventory with ID {self._inventories}"
                ]
            dataset_inventory_dict[0].append(dataset.creationDate)
            return dataset_inventory_dict

        for f in dataset.customValues.customFields:
            if f.dataType == CustomFieldDataType.InventoryItem and f.value is not None:
                if f.id not in self._custom_fields:
                    q = f.value.fetchFullEntity()
                    self._custom_fields[q.id] = q
                else:
                    q = self._custom_fields[q.id]
                if q.customType.id in self._inventories:
                    if q.customType.name not in self._inventory_types:
                        self._inventory_types.append(q.customType.name)
                    if q.id in self._inventory_items or self._inventory_items == []:
                        if dataset_inventory_dict.get(q.id) is None:
                            dataset_inventory_dict[q.id] = [q.name]
                        dataset_inventory_dict[q.id].append(dataset.creationDate)
                    else:
                        continue
                if q.customType.id not in self._inventories:
                    if dataset_inventory_dict.get(0) is None:
                        dataset_inventory_dict[0] = [
                            f"No Inventory Item of Inventory with ID {self._inventories}"
                        ]
                    dataset_inventory_dict[0].append(dataset.creationDate)
        return dataset_inventory_dict



[docs]
    def update_person_dict_dataset(
        self, dataset: Dataset, dataset_person_dict: Dict
    ) -> Dict:
        """Updates the dictionary of persons based on the provided dataset. The
        dictionary is updated with the owner of the data set. The creation
        date is added to the associated list in the dictionary.

        :param dataset: The dataset from which person details and creation dates are extracted.
        :param dataset_person_dict: A dictionary mapping person IDs to lists that contain the
                                        person's name and previously recorded creation dates.
                                        Structure: {person-id: [person-name, ...creation_dates]}

        :return: An updated dictionary where each key is a person ID and each value is a list with the
                person name as the first element followed by all creation dates (both existing and newly added).
                Structure: {person_id: [person_name, creation_date1, creation_date2, ...]}
        """

        if dataset.owner is None:
            if not self._persons or 0 in self._persons:
                if 0 not in dataset_person_dict:
                    dataset_person_dict[0] = ["No Person"]
                dataset_person_dict[0].append(dataset.creationDate)
        else:
            if self._persons and dataset.owner.id not in self._persons:
                return dataset_person_dict
            if dataset.owner.id not in dataset_person_dict:
                dataset_person_dict[dataset.owner.id] = [dataset.owner.name]

            dataset_person_dict[dataset.owner.id].append(dataset.creationDate)

        return dataset_person_dict



[docs]
    def create_statistic(self):
        """
        Generates the statistics for the datasets.
        The statistics include:
        How many data sets were created per time unit (day, week, month, year). The creation date is used.
        The statistics are output per
        - LOGS group
        - Person (or filtered by a specific person)
        - Inventory (or filtered by a specific inventory)

        The result is a CSV file per person, logs group and inventory and a pdf per logs group, person and inventory.
        Data sets without a creation date, or with a creation date outside the
        supported range, are logged with a warning and left out of the statistics.
        If no data set lies in the configured time frame, nothing is written.
        """

        self._logger_datasets.info("Starting to generate statistics for datasets.")

        # Dictionary for statistic of persons
        dataset_person_dict = {}
        # Dictionary for statistic of inventories
        dataset_inventory_dict = {}
        # List for statistic of LOGS logs group, it includes the creation date of each dataset
        datasets_filtered_list = []
        # Count the number of datasets in the given time frame for process informations
        datasets_total = self._logs.datasets(
            DatasetRequestParameter(
                creationDateFrom=self._begin_date,
                creationDateTo=self._end_date,
            )
        ).count

        # Check if there are datasets in the given time frame
        if datasets_total == 0:
            self._logger_datasets.info("No datasets found in the given time frame.")
            return

        self._logger_datasets.info(
            "Processing datasets in the given time frame: begin date: %s - end date: %s.",
            self._begin_date,
            self._end_date,
        )
        count = 0  # Counter for the number of processed datasets
        for dataset in self._logs.datasets(
            DatasetRequestParameter(
                creationDateFrom=self._begin_date, creationDateTo=self._end_date
            )
        ):
            # Skip datasets with invalid creation date. The None check has to
            # come first, because the range bounds need the tzinfo of the date.
            # NOTE(review): the bounds presumably mirror the range of pandas
            # nanosecond timestamps used for plotting - confirm.
            creation_date = dataset.creationDate
            if creation_date is None:
                has_valid_date = False
            else:
                tz = creation_date.tzinfo
                has_valid_date = (
                    datetime(1677, 9, 21, tzinfo=tz)
                    < creation_date
                    < datetime(2262, 4, 11, tzinfo=tz)
                )
            if not has_valid_date:
                self._logger_datasets.warning(
                    "Dataset %d has invalid creation date: %s. Dataset will not be included in the statistics.",
                    dataset.id,
                    creation_date,
                )
                continue

            datasets_filtered_list.append(creation_date)

            # Inventory
            dataset_inventory_dict = self.update_inventory_dict(
                dataset, dataset_inventory_dict
            )

            # Person
            dataset_person_dict = self.update_person_dict_dataset(
                dataset, dataset_person_dict
            )

            # Count first, so the progress message states the number of
            # datasets that have actually been processed.
            count += 1
            if count % 10000 == 0:
                self._logger_datasets.info(
                    "%d/%d datasets processed.",
                    count,
                    datasets_total,
                )

        self._logger_datasets.info("Finished processing datasets.")

        # Sort list by creation date
        datasets_sorted_list = sorted(datasets_filtered_list)

        # Sort the creation dates of each inventory and each person. The first
        # list element is the name and stays in front.
        dataset_inventory_dict = {
            key: [entries[0], *sorted(entries[1:])]
            for key, entries in dataset_inventory_dict.items()
        }
        dataset_person_dict = {
            key: [entries[0], *sorted(entries[1:])]
            for key, entries in dataset_person_dict.items()
        }

        ### Plot statistic of LOGS logs-group and write it in a PDF.
        path_logs_group = self.__dataset_path / "logs_group"
        self.create_plot_list(
            datasets_sorted_list,
            path_logs_group,
            "datasets",
            "logs-group",
            csv_bool=True,
            show_num=self.__show_num,
        )

        ### Plot statistic of inventories and write it in a PDF.
        self._logger_datasets.info("Inventory types: %s", self._inventory_types)
        path_inventory = self.__dataset_path / "inventory"
        self.create_plot_of_dict(
            dataset_inventory_dict,
            path_inventory,
            "datasets",
            f"inventory item of {', '.join(self._inventory_types)}",
            csv_bool=True,
            show_num=self.__show_num,
        )

        ### Plot statistic of persons and write it in a PDF.
        path_person = self.__dataset_path / "person"
        self.create_plot_of_dict(
            dataset_person_dict,
            path_person,
            "datasets",
            "person",
            csv_bool=True,
            show_num=self.__show_num,
        )

        self._logger_datasets.info("Finished generating statistics for datasets.")