diff --git a/cadetrdm/initialize_repo.py b/cadetrdm/initialize_repo.py index 7ccda72eb48a4fbf406fb9d3bb5192d38255aeff..304c625faca2a483d7cc3385d99b71e339be9f60 100644 --- a/cadetrdm/initialize_repo.py +++ b/cadetrdm/initialize_repo.py @@ -49,7 +49,7 @@ def initialize_repo(path_to_repo: str, output_folder_name: (str | bool) = "outpu :param gitignore: List of files to be added to the gitignore file. :param gitattributes: - List of lines to be added to the gittatributes file + List of lines to be added to the gitattributes file :param lfs_filetypes: List of filetypes to be handled by git lfs. :param output_repo_kwargs: @@ -98,7 +98,7 @@ def initialize_repo(path_to_repo: str, output_folder_name: (str | bool) = "outpu write_lines_to_file(path=".gitignore", lines=gitignore, open_type="a") if output_repo_kwargs is None: - output_repo_kwargs = {"gitattributes": ["log.csv merge=union"]} + output_repo_kwargs = {"gitattributes": ["rmd-log.tsv merge=union"]} if output_folder_name: # This means we are in the project repo and should now initialize the output_repo diff --git a/cadetrdm/repositories.py b/cadetrdm/repositories.py index 516e187193b23f5ef0070bf86b44573e1651618c..455424e16cd98c44123f3432e612eff1a6363bd7 100644 --- a/cadetrdm/repositories.py +++ b/cadetrdm/repositories.py @@ -13,7 +13,7 @@ from urllib.request import urlretrieve from tabulate import tabulate import pandas as pd -from cadetrdm.io_utils import recursive_chmod +from cadetrdm.io_utils import recursive_chmod, write_lines_to_file try: import git @@ -98,7 +98,7 @@ class BaseRepo: @property def tags(self): - return None + return list() @property def data_json_path(self): @@ -471,14 +471,14 @@ class BaseRepo: - checking out the master branch, - creating a new branch from there This thereby produces a clear, empty directory for data, while still maintaining - .gitignore and .gitatributes + .gitignore and .gitattributes :param branch_name: Name of the new branch. """ self._git.checkout("master") self._git.checkout('-b', branch_name) # equivalent to $ git checkout -b %branch_name code_backup_path = os.path.join(self.working_dir, "run_history") - logs_path = os.path.join(self.working_dir, "log.csv") + logs_path = os.path.join(self.working_dir, "log.tsv") if os.path.exists(code_backup_path): try: # Remove previous code backup @@ -617,9 +617,11 @@ class ProjectRepo(BaseRepo): self.output_repo.checkout("master") - csv_filepath = os.path.join(self.working_dir, self.output_folder, "log.csv") + self.convert_csv_to_tsv_if_necessary() + + tsv_filepath = os.path.join(self.working_dir, self.output_folder, "log.tsv") - df = pd.read_csv(csv_filepath, sep=",", header=0) + df = pd.read_csv(tsv_filepath, sep="\t", header=0) # Clean up the headers df = df.rename(columns={"Output repo commit message": 'Output commit message', "Output repo branch": "Output branch", @@ -638,6 +640,33 @@ class ProjectRepo(BaseRepo): self.output_repo.checkout(self.output_repo._most_recent_branch) + def convert_csv_to_tsv_if_necessary(self): + """ + If not tsv log is found AND a csv log is found, convert the csv to tsv. + + :return: + """ + tsv_filepath = os.path.join(self.working_dir, self.output_folder, "log.tsv") + if os.path.exists(tsv_filepath): + return + + csv_filepath = os.path.join(self.working_dir, self.output_folder, "log.csv") + if not os.path.exists(csv_filepath): + # We have just initialized the repo and neither tsv nor csv exist. + return + + with open(csv_filepath) as csv_handle: + csv_lines = csv_handle.readlines() + + tsv_lines = [line.replace(",", "\t") for line in csv_lines] + + with open(tsv_filepath, "w") as f: + f.writelines(tsv_lines) + + write_lines_to_file(path=os.path.join(self.working_dir, ".gitattributes"), + lines=["rmd-log.tsv merge=union"], + open_type="a") + def update_output_master_logs(self, ): """ Dumps all the metadata information about the project repositories state and @@ -657,9 +686,10 @@ class ProjectRepo(BaseRepo): os.makedirs(logs_folderpath) json_filepath = os.path.join(logs_folderpath, "metadata.json") - # note: if filename of "log.csv" is changed, + # note: if filename of "log.tsv" is changed, # this also has to be changed in the gitattributes of the init repo func - csv_filepath = os.path.join(self.output_repo.working_dir, "log.csv") + tsv_filepath = os.path.join(self.output_repo.working_dir, "log.tsv") + self.convert_csv_to_tsv_if_necessary() meta_info_dict = { "Output repo commit message": output_commit_message, @@ -669,25 +699,25 @@ class ProjectRepo(BaseRepo): "Project repo folder name": os.path.split(self.working_dir)[-1], "Project repo remotes": self.remote_urls, "Python sys args": str(sys.argv), - "Tags": self.tags, + "Tags": ", ".join(self.tags), } - csv_header = ",".join(meta_info_dict.keys()) - csv_data = ",".join([str(x) for x in meta_info_dict.values()]) + csv_header = "\t".join(meta_info_dict.keys()) + csv_data = "\t".join([str(x) for x in meta_info_dict.values()]) with open(json_filepath, "w") as f: json.dump(meta_info_dict, f, indent=2) - if not os.path.exists(csv_filepath): - with open(csv_filepath, "w") as f: + if not os.path.exists(tsv_filepath): + with open(tsv_filepath, "w") as f: f.write(csv_header + "\n") # csv.writer(csv_header + "\n") - with open(csv_filepath, "r") as f: + with open(tsv_filepath, "r") as f: existing_header = f.readline().replace("\n", "") if existing_header != csv_header: - raise ValueError("The used structure of the meta_dict doesn't match the header found in log.csv") + raise ValueError("The used structure of the meta_dict doesn't match the header found in log.tsv") - with open(csv_filepath, "a") as f: + with open(tsv_filepath, "a") as f: f.write(csv_data + "\n") self.dump_package_list(logs_folderpath)