Source code for neuroharmony.data.combine_tools

"""Combining tools."""
from pathlib import Path
import warnings

import pandas as pd

from neuroharmony.data.collect_tools import find_all_files_by_name
from neuroharmony.data.rois import rois


def _files_exists(directory_path, file_pattern):
    """Verify if a file exists and is the unique file of that kind in the folder."""
    file_search = find_all_files_by_name(directory_path, file_pattern)
    if len(file_search) == 0:
        return False
    elif len(file_search) == 1:
        return file_search[0]
    else:
        return file_search
        warnings.warn("There are more than one %s file in this site." % file_pattern)


[docs]def combine_freesurfer(freesurfer_path): """Combine aparc and aseg data files in a single csv files. It uses the list in columns_name.list file to select the relevant features in the freesurfer output. Parameters ---------- freesurfer_path : string The path to the subject directories. Returns ------- combined : NDFrame of shape [n_subjects, n_features] A DataFrame with the FreeSurfer volumetric data for each subject. """ aseg_stats = pd.read_csv(f"{freesurfer_path}/aseg_stats.txt", delimiter="\t") lh_aparc_stats = pd.read_csv(f"{freesurfer_path}/lh_aparc_stats.txt", delimiter="\t") rh_aparc_stats = pd.read_csv(f"{freesurfer_path}/rh_aparc_stats.txt", delimiter="\t") combined = pd.merge(aseg_stats, lh_aparc_stats, left_on="Measure:volume", right_on="lh.aparc.volume") combined = pd.merge(combined, rh_aparc_stats, left_on="Measure:volume", right_on="rh.aparc.volume") combined.rename(columns={"Measure:volume": "participant_id"}, inplace=True) combined.participant_id = combined.participant_id.str.replace("/", "") combined = combined.set_index("participant_id")[rois] return combined
[docs]def combine_mriqc(mri_path=None, group_path=None, mclf_path=None): """Combine group_T1w and mclf files from the MRIQC output. It uses the list in columns_name.list file to select the relevant features in the freesurfer output. Parameters ---------- mri_path: string The path to a BIDS directory. Returns ------- combined : NDFrame of shape [n_subjects, n_features] A DataFrame with the MRIQC information for each subject. """ if group_path is None: iqm_path = _files_exists(mri_path, "group_T1w.tsv") if mclf_path is None: pred_path = _files_exists(mri_path, "mclf*") if not iqm_path or not pred_path: raise FileNotFoundError(f"MRIQC files not found in {mri_path}") if isinstance(pred_path, list): pred_path = "\n".join(pred_path) raise TypeError(f"There are multiple mclf*.csv files at {mri_path}.\n\ Rename undesired mclf files you want to ignore or specify mclf_path.\n\ MCLF files found:\n{pred_path}") iqm = pd.read_csv(iqm_path, header=0, sep="\t") pred = pd.read_csv(pred_path, header=0) pred["bids_name"] = "sub-" + pred.subject_id + "_T1w" pred.drop(columns="subject_id", inplace=True) combined = pd.merge(iqm, pred, left_on="bids_name", right_on="bids_name") combined.rename(columns={"bids_name": "participant_id"}, inplace=True) combined.participant_id = [subject_line[0] for subject_line in combined.participant_id.str.split("_")] combined = combined.set_index("participant_id") return combined
class Site(object): """A class for site definition. Tools for collecting and combining data from a site given a path to the site data. Parameters ---------- dir_path: pathlib.PosixPath Path to the data of the site. The following files are required: freesurferData.csv, participants.tsv, group_T1w.tsv, mclf*.csv, and Qoala*.csv. These files should be inside scanner folders. Attributes ---------- data: NDFrame Dataframe containing the combined data of all scanners in the site. Examples -------- >>> ixi = Site(Path(data/raw/IXI)) >>> ixi.data +---------------+-------------+--------------+-------------+----------+---+----+ | |3rd-Ventricle|4th-Ventricle | Age |Brain-Stem|...|site| +---------------+-------------+--------------+-------------+----------+---+----+ |participant_id | | +---------------+-------------+--------------+-------------+----------+---+----+ |sub-035-0 |0.000528 | 0.000607 | 37.144422 |0.015725 |...|IXI | +---------------+-------------+--------------+-------------+----------+---+----+ |sub-230-0 |0.000751 | 0.001479 | 21.152635 |0.013917 |...|IXI | +---------------+-------------+--------------+-------------+----------+---+----+ |sub-231-0 |0.000861 | 0.001068 | 58.992471 |0.012547 |...|IXI | +---------------+-------------+--------------+-------------+----------+---+----+ |sub-232-0 |0.000478 | 0.000502 | 28.810404 |0.013322 |...|IXI | +---------------+-------------+--------------+-------------+----------+---+----+ |sub-233-0 |0.000420 | 0.000725 | 26.754278 |0.014778 |...|IXI | +---------------+-------------+--------------+-------------+----------+---+----+ >>> ixi.SCANNER01.data +---------------+----------------------+-----------------+---+-------------+ | |Left-Lateral-Ventricle|Left-Inf-Lat-Vent|...|scanner | +---------------+----------------------+-----------------+---+-------------+ |participant_id | | +---------------+----------------------+-----------------+---+-------------+ |sub-002-2 | 0.003274 | 0.000123 |...|IXI-SCANNER01| +---------------+----------------------+-----------------+---+-------------+ |sub-016-2 | 0.015889 | 0.000380 |...|IXI-SCANNER01| +---------------+----------------------+-----------------+---+-------------+ |sub-017-2 | 0.007326 | 0.000148 |...|IXI-SCANNER01| +---------------+----------------------+-----------------+---+-------------+ |... | ... | ... |...| ... | +---------------+----------------------+-----------------+---+-------------+ |sub-582-2 | 0.002624 | 0.000146 |...|IXI-SCANNER01| +---------------+----------------------+-----------------+---+-------------+ |sub-584-2 | 0.004854 | 0.000188 |...|IXI-SCANNER01| +---------------+----------------------+-----------------+---+-------------+ |sub-585-2 | 0.004373 | 0.000218 |...|IXI-SCANNER01| +---------------+----------------------+-----------------+---+-------------+ """ def __init__(self, dir_path, merge_on="image_id", renamed_id="participant_id"): self.dir_path = dir_path self.merge_on = merge_on self.renamed_id = renamed_id self.name = dir_path.name self._get_scanners() def _get_scanners(self): """ Collect data for the site. This method verifies if the site has multiples scanners and combine the data from freesurfer, MRIQC and Qoala. If the data is not available it returns an empty dataframe. """ self.dir_name = self.dir_path.name subdirs = [subpath.name for subpath in self.dir_path.glob("*") if subpath.is_dir()] if len(subdirs) == 0: self.scanner_list = [] self._get_files() if self._is_complete(): self._load_files() self._combine_files() self.data["scanner"] = self.name self.data.index = self.data[self.renamed_id] + "-00" self.complete_scanners = [self.name] else: self.data = pd.DataFrame() else: self.scanner_list = [subdir.replace(f"{self.dir_name}-", "") for subdir in subdirs] for scanner_name in subdirs: scanner = Scanner( self.dir_path / Path(scanner_name), merge_on=self.merge_on, renamed_id=self.renamed_id ) scanner_name = scanner_name.replace(f"{self.dir_name}-", "") setattr(self, scanner_name, scanner) scanner._get_files() self._combine_all_scanners() self.data["site"] = self.name def _files_exists(self, directory_path, file_pattern): """Verify if a file exists and is the unique file of that kind in the folder.""" file_search = find_all_files_by_name(directory_path, file_pattern) if len(file_search) == 0: return False elif len(file_search) == 1: return file_search[0] elif len(file_search) >= 1: error_str = "\n".join([str(p) for p in file_search]) warnings.warn(f"There are multiple mclf*.csv files at {directory_path}.\n" "Rename undesired mclf files you want to ignore or specify mclf_path.\n" f"MCLF files found:\n{error_str}\nUsing: {file_search[0]}") return file_search[0] def _get_files(self): """Verify if each of the files exist and is unique.""" self.freesurferData_path = self._files_exists(self.dir_path, "freesurferData.csv") self.participants_path = self._files_exists(self.dir_path, "participants.tsv") self.iqm_path = self._files_exists(self.dir_path, "group_T1w.tsv") self.pred_path = self._files_exists(self.dir_path, "mclf*csv") self.qoala_path = self._files_exists(self.dir_path, "Qoala*.csv") def _is_complete(self): """Verify if all necessary files are there.""" return all([self.freesurferData_path, self.participants_path, self.iqm_path, self.pred_path, self.qoala_path]) def _load_files(self): self.freesurferData = pd.read_csv(self.freesurferData_path, header=0) self.participants = pd.read_csv(self.participants_path, header=0, sep="\t") self.iqm = pd.read_csv(self.iqm_path, header=0, sep="\t") self.pred = pd.read_csv(self.pred_path, header=0) self.qoala = pd.read_csv(self.qoala_path, header=0) def _combine_files(self): if all( self.merge_on in dataframe for dataframe in [self.freesurferData, self.participants, self.iqm, self.pred, self.qoala] ): self.data = pd.merge(self.participants, self.freesurferData, on=self.merge_on, how="inner") self.data = pd.merge(self.data, self.iqm, on=self.merge_on, how="inner") self.data = pd.merge(self.data, self.pred, on=self.merge_on, how="inner") self.data = pd.merge(self.data, self.qoala, on=self.merge_on, how="inner") self.data[self.renamed_id] = self.data[self.merge_on] else: warnings.warn("The field %s is not found in %s." % (self.merge_on, self.dir_path.name)) self.data = pd.DataFrame([""], index=[""], columns=[self.renamed_id]) def _combine_all_scanners(self): """If there are multiple scanners on a site they will be combined in the attribute 'data'.""" n_scanners = len(self.scanner_list) participant_id_format = "-%%0%dd" % len(str(n_scanners)) for scanner_id, scanner_name in enumerate(self.scanner_list): if getattr(self, scanner_name)._is_complete(): getattr(self, scanner_name)._load_files() getattr(self, scanner_name)._combine_files() scanner_data = getattr(self, scanner_name).data if len(scanner_data) > 0: scanner_data["scanner"] = "%s-%s" % (self.name, scanner_name) try: id_appendix = participant_id_format % scanner_id scanner_data.index = scanner_data[self.renamed_id] + str(id_appendix) except TypeError: print(self.dir_path.name) self.complete_scanners = [ scanner_name for scanner_name in self.scanner_list if getattr(self, scanner_name)._is_complete() ] if len(self.complete_scanners) > 0: self.data = pd.concat([getattr(self, scanner).data for scanner in self.complete_scanners], sort=True) else: self.data = pd.DataFrame() class Scanner(Site): """ A class for scanner definition. Tools for collecting and combining data from a single scanner given a path to the scanner data. Parameters ---------- dir_path: pathlib.PosixPath Path to the data of the scanner. The following files are required: freesurferData.csv, participants.tsv, group_T1w.tsv, mclf*.csv, and Qoala*.csv. Attributes ---------- data: NDFrame Dataframe containing the combined data of all scanners in the site. Examples -------- >>> scanner_01 = Site(Path(data/raw/IXI/SCANNER01)) >>> scanner_01.data Left-Lateral-Ventricle Left-Inf-Lat-Vent ... scanner participant_id sub-002-2 0.003274 0.000123 ... IXI-SCANNER01 sub-016-2 0.015889 0.000380 ... IXI-SCANNER01 sub-017-2 0.007326 0.000148 ... IXI-SCANNER01 ... ... ... ... ... sub-582-2 0.002624 0.000146 ... IXI-SCANNER01 sub-584-2 0.004854 0.000188 ... IXI-SCANNER01 sub-585-2 0.004373 0.000218 ... IXI-SCANNER01 [313 rows × 179 columns] """ def __init__(self, dir_path, merge_on="image_id", renamed_id="participant_id"): self.dir_path = dir_path self.name = dir_path.name self.merge_on = merge_on self.renamed_id = renamed_id class DataSet(Site): """ A class for dataset definition. Tools for collecting and combining data from all sites given a path to a BIDS dataset. Parameters ---------- dir_path: pathlib.PosixPath Path to the data of the site. The following files are required: freesurferData.csv, participants.tsv, group_T1w.tsv, mclf*.csv, and Qoala*.csv. These files should be inside scanner folders which should be inside site folders. Attributes ---------- data: NDFrame Dataframe containing the combined data of all scanners in the site. Examples -------- >>> DATASET = DataSet(Path(data/raw/)) >>> Data.data 3rd-Ventricle 4th-Ventricle Age Brain-Stem ... site participant_id sub-035-00-00 0.000528 0.000607 37.144422 0.015725 ... IXI sub-230-00-00 0.000751 0.001479 21.152635 0.013917 ... IXI sub-231-00-00 0.000861 0.001068 58.992471 0.012547 ... IXI ... ... ... ... ... ... ... sub-45-00-33 0.000570 0.001581 21.0 0.011838 ... MoralJudgment sub-46-00-33 0.000552 0.001528 36.0 0.013947 ... MoralJudgment sub-47-00-33 0.000609 0.000893 20.0 0.013515 ... MoralJudgment [9303 rows × 180 columns] >>> DATASET.IXI.data 3rd-Ventricle 4th-Ventricle Age Brain-Stem ... site participant_id sub-035-0 0.000528 0.000607 37.144422 0.015725 ... IXI sub-230-0 0.000751 0.001479 21.152635 0.013917 ... IXI sub-231-0 0.000861 0.001068 58.992471 0.012547 ... IXI sub-232-0 0.000478 0.000502 28.810404 0.013322 ... IXI sub-233-0 0.000420 0.000725 26.754278 0.014778 ... IXI [535 rows x 180 columns] >>> DATASET.IXI.SCANNER01.data Left-Lateral-Ventricle Left-Inf-Lat-Vent ... scanner participant_id sub-002-2 0.003274 0.000123 ... IXI-SCANNER01 sub-016-2 0.015889 0.000380 ... IXI-SCANNER01 sub-017-2 0.007326 0.000148 ... IXI-SCANNER01 ... ... ... ... ... sub-582-2 0.002624 0.000146 ... IXI-SCANNER01 sub-584-2 0.004854 0.000188 ... IXI-SCANNER01 sub-585-2 0.004373 0.000218 ... IXI-SCANNER01 [313 rows × 179 columns] """ def __init__(self, dir_path, merge_on="image_id", renamed_id="participant_id"): self.dir_path = dir_path self.name = "All sites" self.merge_on = merge_on self.renamed_id = renamed_id self._get_sites() self._combine_all_sites() def _get_sites(self): self.site_paths = [subpath for subpath in self.dir_path.glob("*") if subpath.is_dir()] self.sites = [site_path.name for site_path in self.site_paths] self.not_empty_sites = [] for site_path in self.site_paths: site = Site(site_path, merge_on=self.merge_on, renamed_id="participant_id") setattr(self, site.name, site) self.not_empty_sites.append(site.name) def _combine_all_sites(self): n_sites = len(self.not_empty_sites) participant_id_format = "-%%0%dd" % len(str(n_sites)) self.data = [] if n_sites > 0: for site_id, site_name in enumerate(self.not_empty_sites): id_appendix = participant_id_format % site_id site = getattr(self, site_name) site.data.index = site.data.index + id_appendix self.data.append(site.data) self.data = pd.concat(self.data, sort=True) else: self.data = False if __name__ == "__main__": DATASET = DataSet(Path("data/raw/")) DATASET.data.to_csv("data/processed/dataset.csv")