Source code for neuroharmony.data.collect_tools

"""Collect tools."""
from pathlib import Path
from requests import get
from shutil import copyfile
from zipfile import ZipFile
import joblib
import os

from pandas import read_csv
from tqdm import tqdm


def _download(url, filepath):
    """Download a file to disk while displaying a progress bar.

    Parameters
    ==========
    url: string
        Address of the file to be downloaded.
    filepath: string or pathlib.Path
        Destination of the downloaded file. Missing parent folders are created.

    Raises
    ======
    requests.HTTPError
        If the server answers with an error status code, so that an error page is never saved as if
        it were the requested file.
    """
    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
    # NOTE(review): the Wget user agent presumably makes the host serve the raw file instead of an
    # HTML landing page -- confirm before changing it.
    headers = {"user-agent": "Wget/1.16 (linux-gnu)"}
    block_size = 1024
    # The context managers release the connection, the progress bar and the file handle even when
    # the transfer is interrupted.
    with get(url, stream=True, headers=headers) as r:
        r.raise_for_status()
        # A missing content-length header yields 0, i.e. a progress bar without a known total.
        total_size = int(r.headers.get("content-length", 0))
        with tqdm(total=total_size, unit="iB", unit_scale=True) as t, open(filepath, "wb") as f:
            for data in r.iter_content(block_size):
                t.update(len(data))
                f.write(data)


def fetch_mri_data():
    """Fetch example of MRI dataset.

    The dataset is a replication of the Bert subject released with the FreeSurfer software for testing.
    The zip archive is downloaded next to the package data (unless it is already there), extracted
    into a ``mri`` folder and then deleted.

    Returns
    =======
    mri_path: str
        The absolute path for the folder with the extracted MRI data.
    """
    script_path = os.path.dirname(os.path.abspath(__file__))
    filepath = f"{script_path}/../../data/mri.zip"
    unzip_folder = str(Path(filepath).parent) + "/mri/"
    if not Path(filepath).exists():
        _download("https://www.dropbox.com/s/kcbq0266bcab3bx/ds002936.zip", filepath)
    Path(unzip_folder).mkdir(parents=True, exist_ok=True)
    # The context manager closes the archive even if the extraction fails.
    with ZipFile(filepath, "r") as zip_file:
        zip_file.extractall(unzip_folder)
    # The archive is only needed for the extraction; remove it once the data is unpacked.
    os.remove(filepath)
    return str(Path(unzip_folder).absolute())
def fetch_sample():
    """Fetch a sample of FreeSurfer derived volumes in the Neuroharmony format.

    Download the FreeSurfer derived volumes of some subjects in the
    `ADHD200 <http://fcon_1000.projects.nitrc.org/indi/adhd200/index.html>`_ and in the
    `PPMI <http://www.ppmi-info.org/>`_ datasets. The file is downloaded on every call.

    Returns
    =======
    dataset: NDFrame of shape [n_subjects, n_features]
        DataFrame with data from ADHD200 and the PPMI subjects in the Neuroharmony format.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    csv_path = f"{module_dir}/../../data/test_sample.csv"
    _download("https://www.dropbox.com/s/mxcaqx2y29n09rp/test_sample.csv", csv_path)
    # The first column of the file is used as the index of the DataFrame.
    sample = read_csv(csv_path, index_col=0)
    return sample
def fetch_trained_model():
    """Fetch Neuroharmony pre-trained model.

    The model file is downloaded only when it is not already stored locally. If the local file
    cannot be loaded, it is deleted, downloaded again and loaded once more.

    Returns
    =======
    neuroharmony: Neuroharmony class
        Pre-trained Neuroharmony model.
    """
    url = "https://www.dropbox.com/s/s3521oqd3fpi0ll/neuroharmony.pkl.gz"
    script_path = os.path.dirname(os.path.abspath(__file__))
    filepath = f"{script_path}/../../data/neuroharmony.pkl.gz"
    if not Path(filepath).exists():
        _download(url, filepath)
    # NOTE(security): joblib.load unpickles the file, which can execute arbitrary code; only keep
    # this pointing at a trusted download location.
    try:
        return joblib.load(filepath)
    except (KeyError, ValueError):
        # Presumably a corrupted or outdated local file: discard it and fetch a fresh copy.
        Path(filepath).unlink()
        _download(url, filepath)
        return joblib.load(filepath)
def find_all_files_by_name(directory_path, file_pattern, depth=2):
    """Find all files matching a pattern in a folder and its subfolders.

    Parameters
    ==========
    directory_path: string
        The path for a giving folder.
    file_pattern: string
        Glob pattern for the file name (participants.tsv, *.csv, Qoala*.csv, ...).
    depth: int
        Number of folder levels to search. Level 0 is `directory_path` itself, level 1 its direct
        subfolders and so on; `depth=0` searches nothing.

    Returns
    =======
    filelist: list
        List of paths to the found files in the format of pathlib.PosixPath.
    """
    filelist = []
    for level in range(depth):
        # One "*" per folder level, e.g. "*/*/participants.tsv" for level 2.
        pattern = "/".join(level * ["*"] + [file_pattern])
        filelist.extend(Path(directory_path).glob(pattern))
    return filelist


def collect_datafile(filepath, root_path, local_path):
    """Collect a datafile.

    The file is copied to the same relative location it has under `root_path`, but under
    `local_path`. Missing destination folders are created.

    Parameters
    ==========
    filepath: string or pathlib.PosixPath
        The path for the file to be copied.
    root_path: string
        The path root to the origin of the data.
    local_path: string
        The path to the local folder you want to save the copied data.

    Returns
    =======
    file_exists: boolean
        Returns True if the file was copied correctly and False otherwise.
    """
    filepath = str(filepath)
    # The destination is obtained by swapping the root prefix for the local folder.
    local_final_path = filepath.replace(root_path, local_path)
    Path(local_final_path).parent.mkdir(parents=True, exist_ok=True)
    copyfile(filepath, local_final_path)
    return Path(local_final_path).exists()


def collect_multiple_datafile(filepath_list, root_path, local_path):
    """Collect a list of datafiles.

    Parameters
    ==========
    filepath_list: list of strings or pathlib.PosixPath
        List of paths for the file to be copied.
    root_path: string
        The path root to the origin of the data.
    local_path: string
        The path to the local folder you want to save the copied data.

    Returns
    =======
    file_exists: boolean
        Returns True if all the files were copied correctly (also for an empty list) and False
        otherwise.
    """
    # Build the full list first so that every file is copied before the results are combined.
    copied = [collect_datafile(filepath, root_path, local_path) for filepath in filepath_list]
    return all(copied)


if __name__ == "__main__":
    SERVER_ROOT = "/media/kcl_2/HDD/SynologyDrive"
    PARTICIPANT_ROOT = f"{SERVER_ROOT}/BIDS_data/"
    FREESURFER_ROOT = f"{SERVER_ROOT}/FreeSurfer_preprocessed/"
    QOALA_ROOT = f"{SERVER_ROOT}/Qoala/"
    MRIQC_ROOT = f"{SERVER_ROOT}/MRIQC/"
    Path("./data/processed").mkdir(exist_ok=True, parents=True)

    # Add an image identifier to every participants file, rewriting the file in place on the server.
    PARTICIPANTS_FILES = find_all_files_by_name(PARTICIPANT_ROOT, "participants.tsv", depth=3)
    for file_path in PARTICIPANTS_FILES:
        df = read_csv(file_path, header=0, sep="\t")
        df["image_id"] = df[["participant_id", "session_id", "acq_id", "run_id"]].agg("_".join, axis=1) + "_T1w"
        df.to_csv(file_path, index=False, sep="\t")

    FSURFER_FILES = find_all_files_by_name(FREESURFER_ROOT, "freesurferData.csv", depth=3)
    QOALA_FILES = find_all_files_by_name(QOALA_ROOT, "Qoala*.csv", depth=3)
    MRIQC_GROUP_FILES = find_all_files_by_name(MRIQC_ROOT, "group_T1w.tsv", depth=3)
    MRIQC_PRED_FILES = find_all_files_by_name(MRIQC_ROOT, "*pred.csv", depth=3)

    # Copy every collected file into ./data/raw/, keeping its location relative to its root folder.
    collect_multiple_datafile(PARTICIPANTS_FILES, PARTICIPANT_ROOT, "./data/raw/")
    collect_multiple_datafile(FSURFER_FILES, FREESURFER_ROOT, "./data/raw/")
    collect_multiple_datafile(QOALA_FILES, QOALA_ROOT, "./data/raw/")
    collect_multiple_datafile(MRIQC_GROUP_FILES, MRIQC_ROOT, "./data/raw/")
    collect_multiple_datafile(MRIQC_PRED_FILES, MRIQC_ROOT, "./data/raw/")