diff --git a/datashuttle/datashuttle_class.py b/datashuttle/datashuttle_class.py index 9322323f..5a884c73 100644 --- a/datashuttle/datashuttle_class.py +++ b/datashuttle/datashuttle_class.py @@ -104,7 +104,6 @@ class DataShuttle: """ def __init__(self, project_name: str, print_startup_message: bool = True): - self._error_on_base_project_name(project_name) self.project_name = project_name ( @@ -329,6 +328,7 @@ def upload_custom( sub_names: Union[str, list], ses_names: Union[str, list], datatype: Union[List[str], str] = "all", + ignore_files: Union[str, list] = "", overwrite_existing_files: OverwriteExistingFiles = "never", dry_run: bool = False, init_log: bool = True, @@ -362,6 +362,11 @@ def upload_custom( The (broad or narrow) NeuroBlueprint datatypes to transfer. If "all", any broad or narrow datatype folder will be transferred. + ignore_files : + A list of files to ignore during transfer. This can + include wildcards (e.g. "*.json"). This is passed + to rclone as an include list. + overwrite_existing_files : If `False`, files on central will never be overwritten by files transferred from local. If `True`, central files @@ -392,7 +397,7 @@ def upload_custom( datatype, overwrite_existing_files, dry_run, - log=True, + "" if self._check_ignore_files(ignore_files) else ignore_files, ) if init_log: @@ -406,6 +411,7 @@ def download_custom( sub_names: Union[str, list], ses_names: Union[str, list], datatype: Union[List[str], str] = "all", + ignore_files: Union[str, list] = "", overwrite_existing_files: OverwriteExistingFiles = "never", dry_run: bool = False, init_log: bool = True, @@ -435,6 +441,11 @@ def download_custom( datatype : see create_folders() + ignore_files : + A list of files to ignore during transfer. This can + include wildcards (e.g. "*.json"). This is passed + to rclone as an include list. + overwrite_existing_files : If "never" files on target will never be overwritten by source. If "always" files on target will be overwritten by source if @@ -466,7 +477,7 @@ def download_custom( datatype, overwrite_existing_files, dry_run, - log=True, + "" if self._check_ignore_files(ignore_files) else ignore_files, ) if init_log: @@ -1338,7 +1349,6 @@ def _transfer_entire_project( local to central) or "download" (from central to local). """ for top_level_folder in canonical_folders.get_top_level_folders(): - utils.log_and_message(f"Transferring `{top_level_folder}`") self._transfer_top_level_folder( @@ -1489,8 +1499,7 @@ def _update_persistent_setting( if setting_name not in settings: utils.log_and_raise_error( - f"Setting key {setting_name} not found in " - f"settings dictionary", + f"Setting key {setting_name} not found in settings dictionary", KeyError, ) @@ -1571,3 +1580,10 @@ def _check_top_level_folder(self, top_level_folder): f"{canonical_top_level_folders}", ValueError, ) + + def _check_ignore_files(self, ignore_files: Union[str, list]): + """ + Check if there are any files or folders to be ignored. + """ + test_list = [""] + return test_list == ignore_files diff --git a/datashuttle/tui/custom_widgets.py b/datashuttle/tui/custom_widgets.py index dc7f5888..d206835d 100644 --- a/datashuttle/tui/custom_widgets.py +++ b/datashuttle/tui/custom_widgets.py @@ -384,6 +384,21 @@ def get_sub_ses_names_and_datatype( return sub_names, ses_names, datatype + def get_ignore_files(self, ignore_files_input_key: str) -> List[str]: + """ + Get the ignore files from the input widget. + Parameters + ---------- + ignore_files_input_key : str + The textual widget id for the ignore files input (prefixed with #) + Returns + A list of ignore files. + ------- + """ + ignore_files = self.query_one(ignore_files_input_key).as_names_list() + + return ignore_files + class TopLevelFolderSelect(Select): """ @@ -470,7 +485,6 @@ def on_select_changed(self, event: Select.Changed) -> None: top_level_folder = event.value if event.value != Select.BLANK: - self.interface.save_tui_settings( top_level_folder, "top_level_folder_select", self.settings_key ) diff --git a/datashuttle/tui/interface.py b/datashuttle/tui/interface.py index e9520bb0..247e1447 100644 --- a/datashuttle/tui/interface.py +++ b/datashuttle/tui/interface.py @@ -297,6 +297,7 @@ def transfer_custom_selection( sub_names: List[str], ses_names: List[str], datatype: List[str], + ignore_files: List[str], upload: bool, ) -> InterfaceOutput: """ @@ -317,6 +318,9 @@ def transfer_custom_selection( datatype : List[str] Datatypes or datatype-level canonical transfer keys to transfer. + ignore_files : List[str] + List of files to ignore during transfer. + upload : bool Upload from local to central if `True`, otherwise download from central to remote. @@ -332,6 +336,7 @@ def transfer_custom_selection( sub_names=sub_names, ses_names=ses_names, datatype=datatype, + ignore_files=ignore_files, overwrite_existing_files=self.tui_settings[ "overwrite_existing_files" ], diff --git a/datashuttle/tui/tabs/transfer.py b/datashuttle/tui/tabs/transfer.py index cd9ba9c4..4c08d2ba 100644 --- a/datashuttle/tui/tabs/transfer.py +++ b/datashuttle/tui/tabs/transfer.py @@ -135,6 +135,14 @@ def compose(self) -> ComposeResult: id="transfer_session_input", placeholder="e.g. ses-001", ), + Label("Ignore File(s)", id="transfer_ignore_file_label"), + ClickableInput( + self.mainwindow, + id="transfer_ignore_file_input", + placeholder="e.g. *.mp4, folder/", + validate_on=None, + validators=None, + ), # These are almost identical to create tab Label("Datatype(s)", id="transfer_datatype_label"), DatatypeCheckboxes( @@ -214,6 +222,7 @@ def on_mount(self) -> None: "#transfer_switch_container", "#transfer_subject_input", "#transfer_session_input", + "#transfer_ignore_file_input", "#transfer_all_checkbox", "#transfer_all_datatype_checkbox", "#transfer_all_non_datatype_checkbox", @@ -393,11 +402,13 @@ def transfer_data(self) -> Worker[InterfaceOutput]: "#transfer_subject_input", "#transfer_session_input" ) ) + ignore_files = self.get_ignore_files("#transfer_ignore_file_input") success, output = self.interface.transfer_custom_selection( selected_top_level_folder, sub_names, ses_names, datatype, + ignore_files, upload, ) diff --git a/datashuttle/tui/tooltips.py b/datashuttle/tui/tooltips.py index e8d0c771..28c20ce7 100644 --- a/datashuttle/tui/tooltips.py +++ b/datashuttle/tui/tooltips.py @@ -201,6 +201,13 @@ def get_tooltip(id: str) -> str: "Use 'all_non_ses' to transfer all other folders only (i.e. that do not start with the 'ses-' prefix)." ) + elif id == "#transfer_ignore_file_input": + tooltip = ( + "A list of files or folders to ignore during transfer. " + "Use wildcards to match any part of a filename e.g. *.mp4, folder/.\n\n" + "Folders must end with '/' suffix, otherwise it will be treated as a file.\n\n" + ) + # 'all', 'all datatype', 'all non datatype' elif id == "#transfer_all_checkbox": tooltip = "Select to transfer all datatype and non-datatype folders within sessions." diff --git a/datashuttle/utils/data_transfer.py b/datashuttle/utils/data_transfer.py index 21121b8e..32d5e65e 100644 --- a/datashuttle/utils/data_transfer.py +++ b/datashuttle/utils/data_transfer.py @@ -1,5 +1,5 @@ -from pathlib import Path -from typing import List, Literal, Optional, Union +from pathlib import Path, PosixPath +from typing import List, Literal, Optional, Tuple, Union from datashuttle.configs import canonical_folders from datashuttle.configs.config_class import Configs @@ -69,7 +69,8 @@ def __init__( datatype: Union[str, List[str]], overwrite_existing_files: OverwriteExistingFiles, dry_run: bool, - log: bool, + ignore_files: Union[str, List[str]] = "", + log: bool = True, ): self.__cfg = cfg self.__upload_or_download = upload_or_download @@ -84,33 +85,44 @@ def __init__( self.sub_names = self.to_list(sub_names) self.ses_names = self.to_list(ses_names) self.datatype = self.to_list(datatype) + self.ignore_files = self.to_list(ignore_files) if ignore_files else [] self.check_input_arguments() - include_list = self.build_a_list_of_all_files_and_folders_to_transfer() + include_list, exclude_list = ( + self.build_a_list_of_all_files_and_folders_to_transfer() + ) + + transfer_file = self.make_transfer_arg(include_list, exclude_list) if any(include_list): output = rclone.transfer_data( self.__cfg, self.__upload_or_download, self.__top_level_folder, - include_list, + transfer_file, cfg.make_rclone_transfer_options( overwrite_existing_files, dry_run ), ) - if log: - utils.log_and_message(output.stderr.decode("utf-8")) - else: - if log: - utils.log_and_message("No files included. None transferred.") + self.reset_transfer_file() + + if log: + message = ( + output.stderr.decode("utf-8") + if any(include_list) + else "No files included. None transferred." + ) + utils.log_and_message(message) # ------------------------------------------------------------------------- - # Build the --include list + # Build the --filter-from list # ------------------------------------------------------------------------- - def build_a_list_of_all_files_and_folders_to_transfer(self) -> List[str]: + def build_a_list_of_all_files_and_folders_to_transfer( + self, + ) -> Tuple[List[str], List[str]]: """ Build a list of every file to transfer based on the user-passed arguments. This cycles through every subject, session and datatype @@ -132,6 +144,7 @@ def build_a_list_of_all_files_and_folders_to_transfer(self) -> List[str]: sub_ses_dtype_include: List[str] = [] extra_folder_names: List[str] = [] extra_filenames: List[str] = [] + exclude_list: List[str] = [] for sub in processed_sub_names: # subjects at top level folder ------------------------------------ @@ -175,15 +188,40 @@ def build_a_list_of_all_files_and_folders_to_transfer(self) -> List[str]: ) include_list = ( - self.make_include_arg(sub_ses_dtype_include) - + self.make_include_arg(extra_folder_names) - + self.make_include_arg(extra_filenames, recursive=False) + self.make_include_arg( + sub_ses_dtype_include, exclude_files=any(self.ignore_files) + ) + + self.make_include_arg( + extra_folder_names, exclude_files=any(self.ignore_files) + ) + + self.make_include_arg( + extra_filenames, + recursive=False, + exclude_files=any(self.ignore_files), + ) ) - return include_list + if self.ignore_files: + excluded_files, excluded_folders = ( + self.update_list_with_excluded_paths( + self.ignore_files, + sub_ses_dtype_include=sub_ses_dtype_include, + extra_folder_names=extra_folder_names, + extra_filenames=extra_filenames, + ) + ) + + exclude_list = self.make_exclude_arg( + excluded_folders + ) + self.make_exclude_arg(excluded_files, recursive=False) + + return include_list, exclude_list def make_include_arg( - self, list_of_paths: List[str], recursive: bool = True + self, + list_of_paths: List[str], + recursive: bool = True, + exclude_files: bool = False, ) -> List[str]: """ Format the list of paths to rclone's required @@ -194,22 +232,79 @@ def make_include_arg( if recursive: - def include_arg(ele: str) -> str: + def include_arg(ele: str, exclude_files: bool = False) -> str: + if exclude_files: + return f" + {ele}/** " return f' --include "{ele}/**" ' else: - def include_arg(ele: str) -> str: + def include_arg(ele: str, exclude_files: bool = False) -> str: + if exclude_files: + return f" + {ele} " return f' --include "{ele}" ' - return ["".join([include_arg(ele) for ele in list_of_paths])] + return [include_arg(ele, exclude_files) for ele in list_of_paths] + + def make_exclude_arg( + self, list_of_paths: List[str], recursive: bool = True + ) -> List[str]: + """ + Format the list of paths to rclone's required + `--exclude` flag format. + """ + if not any(list_of_paths): + return [] + + if recursive: + + def exclude_arg(ele: str) -> str: + return f" - {ele}/** " + + else: + + def exclude_arg(ele: str) -> str: + return f" - {ele} " + + return [exclude_arg(ele) for ele in list_of_paths] + + def make_transfer_arg( + self, include_files: List[str], exclude_files: List[str] + ) -> List[str]: + """ + Format the list of paths to rclone's required + `--filter-from` flag format. + """ + if exclude_files: + ignore_path: PosixPath = self.write_transfer_file( + include_files, exclude_files + ) + return [f' --filter-from "{ignore_path}" '] + + return ["".join(include_files)] + + def write_transfer_file( + self, include_files: List[str], exclude_files: List[str] + ) -> PosixPath: + """ + Write the list of files to transfer to a file + """ + file_path: PosixPath = self.get_datashuttle_ignore_path(self.__cfg) + + with open(file_path, "w") as f: + f.write("\n".join(exclude_files + include_files)) + f.write("\n - **") + + return file_path # ------------------------------------------------------------------------- # Search for non-sub / ses / dtype folders and add them to list # ------------------------------------------------------------------------- def update_list_with_non_sub_top_level_folders( - self, extra_folder_names: List[str], extra_filenames: List[str] + self, + extra_folder_names: List[str], + extra_filenames: List[str], ) -> None: """ Search the subject level for all files and folders in the @@ -347,6 +442,49 @@ def update_list_with_dtype_paths( sub_ses_dtype_include.append(filepath.as_posix()) + # ------------------------------------------------------------------------- + # Update list with files to exclude inside included transfer paths + # ------------------------------------------------------------------------- + + def update_list_with_excluded_paths( + self, + ignore_files: List[str], + sub_ses_dtype_include: List[str], + extra_folder_names: List[str], + extra_filenames: List[str], + ) -> Tuple[List[str], List[str]]: + """ + Update the include list with the files to exclude + from transfer. These are passed as a list of strings + and will be formatted to rclone's `--filter-from` format. + """ + + ignored_extra_files: List[str] = [] + factually_ignored_files: List[str] = [] + ignored_sub_ses_files: List[str] = [] + factually_ignored_folders: List[str] = [] + + if extra_filenames: + ignored_extra_files += folders.search_for_ignore_extra_files( + ignore_files, + extra_filenames, + self.__base_folder, + ) + + if sub_ses_dtype_include or extra_folder_names: + factually_ignored_folders, ignored_sub_ses_files = ( + folders.search_for_ignore_files_in_folders( + ignore_files, + sub_ses_dtype_include, + extra_folder_names, + self.__base_folder, + ) + ) + + factually_ignored_files = ignored_extra_files + ignored_sub_ses_files + + return factually_ignored_files, factually_ignored_folders + # ------------------------------------------------------------------------- # Utils # ------------------------------------------------------------------------- @@ -412,6 +550,19 @@ def check_input_arguments( ValueError, ) + def get_datashuttle_ignore_path(self, cfg: Configs) -> PosixPath: + """ + Return the path to the .datashuttleignore file + """ + return cfg["local_path"] / ".datashuttle/.datashuttleignore" + + def reset_transfer_file(self) -> None: + """ + Reset the .datashuttleignore file + """ + with open(self.get_datashuttle_ignore_path(self.__cfg), "w") as f: + f.write("") + # ------------------------------------------------------------------------- # Format Arguments # ------------------------------------------------------------------------- diff --git a/datashuttle/utils/folders.py b/datashuttle/utils/folders.py index 56852640..e62cbb1a 100644 --- a/datashuttle/utils/folders.py +++ b/datashuttle/utils/folders.py @@ -150,7 +150,6 @@ def make_datatype_folders( for datatype_key, datatype_folder in datatype_items: # type: ignore if datatype_folder.level == level: - datatype_name = datatype_folder.name datatype_path = sub_or_ses_level_path / datatype_name @@ -471,8 +470,7 @@ def search_sub_or_ses_level( """ if ses and not sub: utils.log_and_raise_error( - "cannot pass session to " - "search_sub_or_ses_level() without subject", + "cannot pass session to search_sub_or_ses_level() without subject", ValueError, ) @@ -552,7 +550,6 @@ def search_filesystem_path_for_folders( sorter_files_and_folders = sorted(all_files_and_folders) for file_or_folder_str in sorter_files_and_folders: - file_or_folder = Path(file_or_folder_str) if file_or_folder.is_dir(): @@ -565,3 +562,85 @@ def search_filesystem_path_for_folders( ) return all_folder_names, all_filenames + + +# ----------------------------------------------------------------------------- +# Search RegEx ignore files/folders +# ----------------------------------------------------------------------------- + + +def search_for_ignore_extra_files( + ignored_patterns: List[str], + extra_filenames: List[str], + base_folder: Path, +) -> List[str]: + """ + Search for files or folders that match the regex patterns + """ + + ignored_paths: List[str] = [] + + files, _ = utils.split_files_and_folders_regex(ignored_patterns) + subjects: List[str] = utils.search_sub_match(extra_filenames) + + if files: + ignored_paths += find_match_exclude_patterns( + files, subjects, base_folder + ) + + return ignored_paths + + +def search_for_ignore_files_in_folders( + ignored_patterns: List[str], + sub_ses_dtype_include: List[str], + extra_folder_names: List[str], + base_folder: Path, +) -> Tuple[List[str], List[str]]: + """ + Search for files or folders that match the regex patterns + """ + ignored_sub_ses_folders: List[str] = [] + ignored_sub_ses_files: List[str] = [] + ignored_extra_folders: List[str] = [] + + files, folders = utils.split_files_and_folders_regex(ignored_patterns) + subjects: List[str] = utils.search_sub_match(extra_folder_names) + + if folders: + ignored_sub_ses_folders += find_match_exclude_patterns( + folders, sub_ses_dtype_include, base_folder + ) + + ignored_extra_folders += find_match_exclude_patterns( + folders, subjects, base_folder + ) + + if files: + ignored_sub_ses_files += find_match_exclude_patterns( + files, sub_ses_dtype_include, base_folder + ) + + return ( + (ignored_sub_ses_folders + ignored_extra_folders), + ignored_sub_ses_files, + ) + + +def find_match_exclude_patterns( + files_or_folders_pattern: List[str], + sub_ses_dtype_include: List[str], + base_folder: Path, +) -> List[str]: + """ + Search for subject (or extra) that match excluded files or folders + """ + ignored_folders: List[str] = [] + + for pattern in files_or_folders_pattern: + for sub_ses in sub_ses_dtype_include: + ignored_folders += glob.glob( + (base_folder / sub_ses / pattern).as_posix(), + recursive=True, + ) + return [utils.find_sub_match(path) for path in ignored_folders] diff --git a/datashuttle/utils/utils.py b/datashuttle/utils/utils.py index 87a39e5c..d489fe6f 100644 --- a/datashuttle/utils/utils.py +++ b/datashuttle/utils/utils.py @@ -138,7 +138,6 @@ def get_values_from_bids_formatted_name( """ all_values = [] for name in all_names: - if key not in name: raise NeuroBlueprintError( f"The key {key} is not found in {name}", KeyError @@ -224,3 +223,37 @@ def all_identical(list_: List) -> bool: Check that all values in a list are identical. """ return len(set(list_)) == 1 + + +def split_files_and_folders_regex( + ignored_files: List[str], +) -> tuple[List[str], List[str]]: + """ + Split the files and folders from a list of datashuttleignore patterns. + """ + folder_pattern = re.compile(r".*/$") + + folders = [file for file in ignored_files if folder_pattern.match(file)] + files = [file for file in ignored_files if not folder_pattern.match(file)] + + return files, folders + + +def find_sub_match( + path: str, +) -> str: + """ + Find the sub match in a path. + """ + match = re.findall(r"(sub-\d+/.*)", path) + return match.pop() if match else "" + + +def search_sub_match(extra_filenames: List[str]) -> List[str]: + """ """ + + return [ + match.group(1) + for path in extra_filenames + if (match := re.search(r"(sub-\d+)", path)) + ] diff --git a/tests/test_utils.py b/tests/test_utils.py index ad908160..dc1ed18d 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -397,8 +397,10 @@ def make_local_folders_with_files_in( project.create_folders(top_level_folder, subs, sessions, datatype) for root, dirs, _ in os.walk(project.cfg["local_path"]): if not dirs: - path_ = Path(root) / "placeholder_file.txt" - write_file(path_, contents="placeholder") + placeholders = ["placeholder_file.txt", "placeholder_movie.mp4"] + path_ = [Path(root) / placeholder for placeholder in placeholders] + for path_ in path_: + write_file(path_, contents="placeholder") # ----------------------------------------------------------------------------- @@ -671,9 +673,9 @@ def check_working_top_level_folder_only_exists( def read_log_file(logging_path): log_filepath = list(glob.glob(str(logging_path / "*.log"))) - assert len(log_filepath) == 1, ( - f"there should only be one log " f"in log output path {logging_path}" - ) + assert ( + len(log_filepath) == 1 + ), f"there should only be one log in log output path {logging_path}" log_filepath = log_filepath[0] with open(log_filepath, "r") as file: diff --git a/tests/tests_tui/test_tui_transfer.py b/tests/tests_tui/test_tui_transfer.py index 3e8c389f..309f40c9 100644 --- a/tests/tests_tui/test_tui_transfer.py +++ b/tests/tests_tui/test_tui_transfer.py @@ -24,7 +24,6 @@ async def test_transfer_entire_project( app = TuiApp() async with app.run_test(size=self.tui_size()) as pilot: - await self.check_and_click_onto_existing_project( pilot, project_name ) @@ -118,7 +117,6 @@ async def test_transfer_top_level_folder( app = TuiApp() async with app.run_test(size=self.tui_size()) as pilot: - await self.check_and_click_onto_existing_project( pilot, project_name ) @@ -154,9 +152,16 @@ async def test_transfer_top_level_folder( @pytest.mark.parametrize("top_level_folder", ["rawdata", "derivatives"]) @pytest.mark.parametrize("upload_or_download", ["upload", "download"]) + @pytest.mark.parametrize( + "ignored_files", ["", "*.mp4", "placeholder_movie.mp4"] + ) @pytest.mark.asyncio async def test_transfer_custom( - self, setup_project_paths, top_level_folder, upload_or_download + self, + setup_project_paths, + top_level_folder, + upload_or_download, + ignored_files, ): tmp_config_path, tmp_path, project_name = setup_project_paths.values() @@ -167,7 +172,6 @@ async def test_transfer_custom( app = TuiApp() async with app.run_test(size=self.tui_size()) as pilot: - await self.check_and_click_onto_existing_project( pilot, project_name ) @@ -193,6 +197,10 @@ async def test_transfer_custom( pilot, "#transfer_session_input", ses_to_transfer ) + await self.fill_input( + pilot, "#transfer_ignore_file_input", ignored_files + ) + await self.scroll_to_click_pause( pilot, "#transfer_all_checkbox" ) # turn this off @@ -228,7 +236,6 @@ async def test_transfer_custom( async def switch_top_level_folder_select( self, pilot, id, top_level_folder ): - if top_level_folder == "rawdata": assert pilot.app.screen.query_one(id).value == "rawdata" else: