Source code for LOGS_solutions.DownloadDatasets.DatasetDownloader

#!/usr/bin/env python3

import json
import os
from typing import Optional
from pathlib import Path
from datetime import datetime, timedelta
import tempfile

from ..DownloadDatasets.DatasetHandler import DatasetHandler
from ..DownloadDatasets.ProgressBar import ProgressBar

from LOGS.Entities import (
    Dataset,
    DatasetRequestParameter,
    DatasetOrder,
    Project,
    ProjectRequestParameter,
)
from LOGS.LOGS import LOGS



[docs]
class DatasetDownloader:
    """Downloads all datasets."""

    def __init__(
        self,
        logs: LOGS,
        args,
    ):
        """Initialization.

        :param logs: LOGS object to access the LOGS web API
        :param target_path: Path where all datasets should be saved.
        :param project_id: List of project ids which should be
            downloaded
        :param include_metadata: True if metadata of datasets, projects
            and samples should be saved.
        :param duplicate_handling: How datasets with the same name
            should be handled. 1: rename, 2: overwrite, 3: take first.
        :param symlink_path: Path where the datasets should be sorted by
            the format with symlinks to the original datasets. If None,
            no symlinks will be created.
        :param include_sample_projects: False if projects should be
            taken from datasets only. If True, all projects with
            datasets will be downloaded.
        """

        self._logs = logs
        self._target_path = args.target_path
        self._project_ids = args.project_ids
        self._include_metadata = args.include_metadata
        self._duplicate_handling = args.duplicate_handling
        self._symlink_path = args.symlink_path
        self._include_sample_projects = args.include_sample_projects
        self._start_from = self.valid_start_from()

        if self._symlink_path is not None:
            if self.can_create_symlink() is False:
                print(
                    "Error: Symbolic links are not supported on this system. They will not be created."
                )
                self._symlink_path = None


[docs]
    def can_create_symlink(self) -> bool:
        """Check if the system supports creating symbolic links.

        :return: True if the system supports creating symbolic links,
            False otherwise.
        """

        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            target = tmp_path / "dummy.txt"
            link = tmp_path / "link.txt"

            try:
                # Dummy-Zieldatei anlegen
                target.write_text("symlink test")

                # Versuche symbolischen Link zu erstellen
                link.symlink_to(target)

                # Prüfen, ob der Link korrekt erstellt wurde
                return link.is_symlink() and link.read_text() == "symlink test"

            except (OSError, NotImplementedError):
                return False



[docs]
    def valid_start_from(self) -> Optional[datetime]:
        """Check if last_state.json exists and return the dateAdded if it
        exists, else return None.

        :return: datetime object of the last dateAdded or None
        """
        if os.path.exists("last_state.json"):
            with open("last_state.json", "r", encoding="utf-8") as stat:
                data = json.load(stat)

            if isinstance(data, dict):
                if "acquisitionDate" in data:
                    self.startFrom = datetime.fromisoformat(
                        data["acquisitionDate"]
                    ) - timedelta(seconds=1)
                    return self.startFrom
        else:
            return None



[docs]
    def download_datasets(
        self, project_path: str, sample_path: str, project: Project, dataset: Dataset
    ):
        """Parse and download datasets. Create symlinks if symlink_path is not
        None. Create metadata files if include_metadata is True.

        :param project_path: Path where the project should be saved.
        :param sample_path: Path where the sample should be saved.
        :param project: Project object
        :param dataset: Dataset object
        """

        sample_path.mkdir(parents=True, exist_ok=True)
        dataset_handler = DatasetHandler(
            dataset_target_path=sample_path,
            dataset=dataset,
            include_metadata=self._include_metadata,
            duplicate_handling=self._duplicate_handling,
            symlink_path=self._symlink_path,
            original_target_path=self._target_path,
        )
        dataset_handler.parse_dataset()

        # create symlink if symlink_path is not None
        if self._symlink_path is not None:
            dataset_handler.create_symlink()

        ProgressBar.update_processed_files()

        if self._include_metadata:
            project_information = self._logs.project(project.id).toJson()
            project_info_path = project_path / "project_information.json"

            if not project_info_path.exists():
                with open(project_info_path, "w", encoding="utf-8") as file:
                    json.dump(
                        project_information,
                        file,
                        ensure_ascii=False,
                        indent=4,
                    )

            if dataset.sample is not None:
                sample_information = self._logs.sample(dataset.sampleId).toJson()
                sample_info_path = sample_path / "sample_information.json"

                if not sample_info_path.exists():
                    with open(sample_info_path, "w", encoding="utf-8") as file:
                        json.dump(
                            sample_information,
                            file,
                            ensure_ascii=False,
                            indent=4,
                        )



[docs]
    def download_datasets_structured(self):
        """Downloads all datasets structured in the given path."""

        ProgressBar.start_progressbar("Downloading datasets")

        # Parse claimed datasets
        for dataset in self._logs.datasets(
            DatasetRequestParameter(
                isClaimed=True,
                orderby=DatasetOrder("ACQUISITION_DATE_ASC"),
                acquisitionDateFrom=self._start_from,
            )
        ):
            dataset_projects_ids = [project.id for project in dataset.projects]
            with open(Path("./last_state.json"), "w", encoding="utf-8") as stat:
                json.dump(
                    {"acquisitionDate": dataset.acquisitionDate.isoformat()}, stat
                )  # Save last date state in case of interruption

            target_path = self._target_path / "Claimed"
            for project in dataset.projects:
                # If project ids are given, only download datasets from this projects, else download all projects
                if len(self._project_ids) > 0:
                    if project.id in self._project_ids:
                        project_path = target_path / project.name
                        sample_path = project_path / dataset.sample.name
                        self.download_datasets(
                            project_path,
                            sample_path,
                            project,
                            dataset,
                        )

                else:
                    project_path = target_path / project.name
                    sample_path = project_path / dataset.sample.name
                    self.download_datasets(
                        project_path,
                        sample_path,
                        project,
                        dataset,
                    )

            # If include_sample_projects is True, download also projects of samples
            if self._include_sample_projects:
                for project in self._logs.projects(
                    ProjectRequestParameter(sampleIds=[dataset.sampleId])
                ):
                    if project.id not in dataset_projects_ids:
                        # If project ids are given, only download datasets from this projects, else download all projects
                        if len(self._project_ids) > 0:
                            if project.id in self._project_ids:
                                project_path = target_path / project.name
                                sample_path = project_path / dataset.sample.name
                                self.download_datasets(
                                    project_path,
                                    sample_path,
                                    project,
                                    dataset,
                                )
                        else:
                            project_path = target_path / project.name
                            sample_path = project_path / dataset.sample.name
                            self.download_datasets(
                                project_path,
                                sample_path,
                                project,
                                dataset,
                            )

        # Parse unclaimed datasets
        for dataset in self._logs.datasets(
            DatasetRequestParameter(
                isClaimed=False,
                orderby=DatasetOrder("ACQUISITION_DATE_ASC"),
                acquisitionDateFrom=self._start_from,
            )
        ):
            dataset_projects_ids = (
                [project.id for project in dataset.projects]
                if dataset.projects is not None
                else []
            )

            with open(Path("./last_state.json"), "w", encoding="utf-8") as stat:
                json.dump(
                    {"acquisitionDate": dataset.acquisitionDate.isoformat()}, stat
                )  # Save last date state in case of interruption

            target_path = self._target_path / "Unclaimed"

            if dataset.projects is not None:
                for project in dataset.projects:
                    # If project ids are given, only download datasets from this projects, else download all projects
                    if len(self._project_ids) > 0:
                        if project.id in self._project_ids:
                            project_path = target_path / project.name
                            if dataset.sample is not None:
                                sample_path = project_path / dataset.sample.name
                            else:
                                sample_path = project_path / "NoSample"

                            self.download_datasets(
                                project_path,
                                sample_path,
                                project,
                                dataset,
                            )

                    else:
                        project_path = target_path / project.name
                        if dataset.sample is not None:
                            sample_path = project_path / dataset.sample.name
                        else:
                            sample_path = project_path / "NoSample"

                        self.download_datasets(
                            project_path,
                            sample_path,
                            project,
                            dataset,
                        )

            if self._include_sample_projects:
                if dataset.sample is not None:
                    for project in self._logs.projects(
                        ProjectRequestParameter(sampleIds=[dataset.sampleId])
                    ):
                        if project.id not in dataset_projects_ids:
                            # If project ids are given, only download datasets from this projects, else download all projects
                            if len(self._project_ids) > 0:
                                if project.id in self._project_ids:
                                    project_path = target_path / project.name
                                    sample_path = project_path / dataset.sample.name

                                    self.download_datasets(
                                        project_path,
                                        sample_path,
                                        project,
                                        dataset,
                                    )
                            else:
                                project_path = target_path / project.name
                                sample_path = project_path / dataset.sample.name

                                self.download_datasets(
                                    project_path,
                                    sample_path,
                                    project,
                                    dataset,
                                )

            # If the dataset has no sample and no project
            if (
                dataset.sample is None
                and dataset.projects is None
                and (self._project_ids == [] or 0 in self._project_ids)
            ):
                project_path = target_path / "NoProject"
                sample_path = project_path / "NoSample"
                sample_path.mkdir(parents=True, exist_ok=True)
                dataset_handler = DatasetHandler(
                    dataset_target_path=sample_path,
                    dataset=dataset,
                    include_metadata=self._include_metadata,
                    duplicate_handling=self._duplicate_handling,
                    symlink_path=self._symlink_path,
                    original_target_path=self._target_path,
                )
                dataset_handler.parse_dataset()

                # create symlink if symlink_path is not None
                if self._symlink_path is not None:
                    dataset_handler.create_symlink()
                ProgressBar.update_processed_files()

        ProgressBar.stop_progressbar()