From ee0e135ab6f1e9fd528af7244e4b870759619bc2 Mon Sep 17 00:00:00 2001 From: Sangjoon Bob Lee Date: Fri, 24 Jan 2025 06:26:40 -0500 Subject: [PATCH 1/4] feat: run preprocess step while moving files based on errors to move cif files without site/label --- src/cifkit/preprocessors/error.py | 10 +- src/cifkit/preprocessors/format.py | 8 +- src/cifkit/utils/cif_editor.py | 1 + tests/data/cif/error/combined/260486.cif | 118 +++++++++++++++++++++++ 4 files changed, 132 insertions(+), 5 deletions(-) create mode 100644 tests/data/cif/error/combined/260486.cif diff --git a/src/cifkit/preprocessors/error.py b/src/cifkit/preprocessors/error.py index b912739..07f13b6 100644 --- a/src/cifkit/preprocessors/error.py +++ b/src/cifkit/preprocessors/error.py @@ -2,6 +2,7 @@ from pathlib import Path from cifkit.models.cif import Cif +from cifkit.preprocessors.format import preprocess_label_element_loop_values from cifkit.utils.cif_parser import check_unique_atom_site_labels @@ -20,6 +21,7 @@ def move_files_based_on_errors(dir_path, file_paths): # Dictionary to hold directory paths for each error type error_directories = { + "error_no_labels": dir_path / "error_no_labels", "error_operations": dir_path / "error_operations", "error_duplicate_labels": dir_path / "error_duplicate_labels", "error_wrong_loop_value": dir_path / "error_wrong_loop_value", @@ -35,15 +37,19 @@ def move_files_based_on_errors(dir_path, file_paths): filename = os.path.basename(file_path) print(f"Preprocessing {file_path} ({i}/{len(file_paths)})") try: - # Check the label before instantiating the Cif object to save time + # Preprocess the CIF file + preprocess_label_element_loop_values(file_path) + # Check site element can be parsed from site label check_unique_atom_site_labels(file_path) - # Instantiate the Cif object fully + # Attempt to initialize a Cif object Cif(file_path, is_formatted=True) except Exception as e: error_message = str(e) # Example of handling specific errors, adjust as needed if "symmetry operation" in error_message: error_type = "error_operations" + elif "no atomic label and type" in error_message: + error_type = "error_no_labels" elif "contains duplicate atom site labels" in error_message: error_type = "error_duplicate_labels" elif "Wrong number of values in loop" in error_message: diff --git a/src/cifkit/preprocessors/format.py b/src/cifkit/preprocessors/format.py index 0e6ae94..beeda75 100644 --- a/src/cifkit/preprocessors/format.py +++ b/src/cifkit/preprocessors/format.py @@ -18,12 +18,14 @@ def preprocess_label_element_loop_values(file_path: str) -> None: content_lines = cif_parser.get_line_content_from_tag( file_path, "_atom_site_occupancy" ) - + # Check whether the content line is empty, then throw an error for line in content_lines: line = line.strip() - site_label, atom_type_symbol = line.split()[:2] + try: + site_label, atom_type_symbol = line.split()[:2] + except ValueError: + raise ValueError("The file contains no atomic label and type.") atom_type_from_label = string_parser.get_atom_type_from_label(site_label) - unique_elements = cif_parser.get_unique_elements_from_loop(loop_values) """Type 8. Ex) 1817279.cif. diff --git a/src/cifkit/utils/cif_editor.py b/src/cifkit/utils/cif_editor.py index 832833d..5cb1fcf 100644 --- a/src/cifkit/utils/cif_editor.py +++ b/src/cifkit/utils/cif_editor.py @@ -63,6 +63,7 @@ def edit_cif_file_based_on_db(file_path: str): add_hashtag_in_first_line(file_path) elif db_source == "PCD": remove_author_loop(file_path) + # Preprocessing the label is only tested on PCD files preprocess_label_element_loop_values(file_path) check_unique_atom_site_labels(file_path) diff --git a/tests/data/cif/error/combined/260486.cif b/tests/data/cif/error/combined/260486.cif new file mode 100644 index 0000000..70d4830 --- /dev/null +++ b/tests/data/cif/error/combined/260486.cif @@ -0,0 +1,118 @@ +############################################################################## +# # +# Au-In # Au9In4 ht # 260486 # +# # +############################################################################## +# # +# Pearson's Crystal Data # +# Crystal Structure Database for Inorganic Compounds (on DVD) # +# Release 2024/25 # +# Editors: Pierre Villars, Karin Cenzual, and Vitaliy Dubenskyy # +# # +# Copyright (c) ASM International & Material Phases Data System (MPDS), # +# Switzerland & National Institute for Materials Science (NIMS), Japan, 2024 # +# All rights reserved. Version 2024.07 # +# # +# This copy of Pearson's Crystal Data is licensed to: # +# Hunter College - City University of New York # +# # +############################################################################## + +data_260486 +_audit_creation_date 2024-09-17 +_audit_creation_method +; +Pearson's Crystal Data browser +; +#_database_code_PCD 260486 +_database_code_PDF ? + +# Entry summary + +_chemical_formula_structural 'Au~9~ In~4~' +_chemical_formula_sum 'Au9 In4' +_chemical_name_mineral ? +_chemical_compound_source ? +_chemical_name_structure_type Au~6~(Au~0.5~In~0.5~)~6~In,cP76,215 +_chemical_formula_weight 2232.0 + +# Bibliographic data + +_publ_section_title +'The gold-indium thin film system: A high resolution electron microscopy study' +_journal_coden_ASTM JCOMAH +_journal_name_full 'J. Less-Common Met.' +_journal_year 1986 +_journal_volume 116 +_journal_page_first 63 +_journal_page_last 72 +_journal_language English +loop_ + _publ_author_name + _publ_author_address +'' +; +; + +# Standardized crystallographic data + +_cell_length_a 9.84 +_cell_length_b 9.84 +_cell_length_c 9.84 +_cell_angle_alpha 90 +_cell_angle_beta 90 +_cell_angle_gamma 90 +_cell_volume 952.76 +_cell_formula_units_Z 4 +_space_group_IT_number 215 +_space_group_name_H-M_alt 'P -4 3 m' +loop_ + _space_group_symop_id + _space_group_symop_operation_xyz + 1 'x, y, z' + 2 '-x, -y, z' + 3 '-x, -z, y' + 4 '-x, y, -z' + 5 '-x, z, -y' + 6 '-y, -x, z' + 7 '-y, -z, x' + 8 '-y, x, -z' + 9 '-y, z, -x' + 10 '-z, -x, y' + 11 '-z, -y, x' + 12 '-z, x, -y' + 13 '-z, y, -x' + 14 'x, -y, -z' + 15 'x, -z, -y' + 16 'x, z, y' + 17 'y, -x, -z' + 18 'y, -z, -x' + 19 'y, x, z' + 20 'y, z, x' + 21 'z, -x, -y' + 22 'z, -y, -x' + 23 'z, x, y' + 24 'z, y, x' + + +_exptl_crystal_colour ? +_exptl_crystal_density_meas ? +_exptl_crystal_density_diffrn 15.56 +_cell_measurement_temperature ? +_cell_measurement_radiation electrons +_cell_measurement_reflns_used ? +_diffrn_ambient_temperature ? +_diffrn_measurement_device ? +_diffrn_measurement_device_type ? +_diffrn_radiation_type ? +_diffrn_reflns_number ? +_exptl_absorpt_coefficient_mu ? +_exptl_absorpt_correction_type ? +_computing_structure_solution ? +_refine_ls_number_parameters ? +_refine_ls_number_reflns ? +_refine_ls_R_factor_gt ? +_refine_ls_wR_factor_gt ? + +# End of data set 260486 + From 20718a46a0d94c8781084b7b8a28f26dfb3a15e3 Mon Sep 17 00:00:00 2001 From: Sangjoon Bob Lee Date: Fri, 24 Jan 2025 06:37:01 -0500 Subject: [PATCH 2/4] feat: Relocate PCD .cif files that have no atomic site/label while preprocesing each .cif file --- news/format-PCD.rst | 23 +++++++++++++++++++++++ src/cifkit/preprocessors/error.py | 12 ++++++++---- tests/core/models/test_cif.py | 2 +- 3 files changed, 32 insertions(+), 5 deletions(-) create mode 100644 news/format-PCD.rst diff --git a/news/format-PCD.rst b/news/format-PCD.rst new file mode 100644 index 0000000..3e98435 --- /dev/null +++ b/news/format-PCD.rst @@ -0,0 +1,23 @@ +**Added:** + +* Relocate PCD .cif files that have no atomic site/label while preprocesing each .cif file. + +**Changed:** + +* + +**Deprecated:** + +* + +**Removed:** + +* + +**Fixed:** + +* + +**Security:** + +* diff --git a/src/cifkit/preprocessors/error.py b/src/cifkit/preprocessors/error.py index 07f13b6..8c54c07 100644 --- a/src/cifkit/preprocessors/error.py +++ b/src/cifkit/preprocessors/error.py @@ -37,12 +37,16 @@ def move_files_based_on_errors(dir_path, file_paths): filename = os.path.basename(file_path) print(f"Preprocessing {file_path} ({i}/{len(file_paths)})") try: - # Preprocess the CIF file - preprocess_label_element_loop_values(file_path) + # Attempt to initialize a Cif object and if it is PCD source, + # preprocess the CIF file and identify the error type + cif = Cif(file_path, is_formatted=True) + db_source = cif.db_source + if db_source == "PCD": + # Preprocess the CIF file + preprocess_label_element_loop_values(file_path) # Check site element can be parsed from site label check_unique_atom_site_labels(file_path) - # Attempt to initialize a Cif object - Cif(file_path, is_formatted=True) + except Exception as e: error_message = str(e) # Example of handling specific errors, adjust as needed diff --git a/tests/core/models/test_cif.py b/tests/core/models/test_cif.py index 72c1b00..42ead56 100644 --- a/tests/core/models/test_cif.py +++ b/tests/core/models/test_cif.py @@ -566,7 +566,7 @@ def test_init_error_coord_missing(): with pytest.raises(ValueError) as e: Cif(file_path) - assert "not enough values to unpack (expected 2, got 1)" in str(e.value) + assert "contains no atomic label and type." in str(e.value) """ From 2f2972b111417c886fecd518314730168d0b9c19 Mon Sep 17 00:00:00 2001 From: Sangjoon Bob Lee Date: Fri, 24 Jan 2025 10:33:22 -0500 Subject: [PATCH 3/4] relocate PCD files when no labels/sites are found --- src/cifkit/models/cif_ensemble.py | 2 +- src/cifkit/preprocessors/error.py | 35 +++++++++++++++---------------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/src/cifkit/models/cif_ensemble.py b/src/cifkit/models/cif_ensemble.py index bee7b77..3b0d28b 100644 --- a/src/cifkit/models/cif_ensemble.py +++ b/src/cifkit/models/cif_ensemble.py @@ -64,7 +64,7 @@ def __init__( self._log_info(CifEnsembleLog.PREPROCESSING.value) for file_path in file_paths: edit_cif_file_based_on_db(file_path) - # Move ill-formatted files after processing + # Move ill-formatted files after pre-processing move_files_based_on_errors(cif_dir_path, file_paths) # Initialize new files after ill-formatted files are moved diff --git a/src/cifkit/preprocessors/error.py b/src/cifkit/preprocessors/error.py index 8c54c07..083cb22 100644 --- a/src/cifkit/preprocessors/error.py +++ b/src/cifkit/preprocessors/error.py @@ -6,28 +6,21 @@ from cifkit.utils.cif_parser import check_unique_atom_site_labels -def make_directory_and_move(file_path, dir_path, new_file_path): - """Create directory if it doesn't exist and move the file.""" - os.makedirs(dir_path, exist_ok=True) - new_file_path = os.path.join(dir_path, new_file_path) - os.rename(file_path, new_file_path) - - -def move_files_based_on_errors(dir_path, file_paths): - print(f"\nCIF Preprocessing in {dir_path} begun...\n") +def move_files_based_on_errors(cif_dir_path, file_paths): + print(f"\nCIF Preprocessing in {cif_dir_path} begun...\n") # Ensure dir_path is a Path object - dir_path = Path(dir_path) + cif_dir_path = Path(cif_dir_path) # Dictionary to hold directory paths for each error type error_directories = { - "error_no_labels": dir_path / "error_no_labels", - "error_operations": dir_path / "error_operations", - "error_duplicate_labels": dir_path / "error_duplicate_labels", - "error_wrong_loop_value": dir_path / "error_wrong_loop_value", - "error_coords": dir_path / "error_coords", - "error_invalid_label": dir_path / "error_invalid_label", - "error_others": dir_path / "error_others", + "error_no_labels": cif_dir_path / "error_no_labels", + "error_operations": cif_dir_path / "error_operations", + "error_duplicate_labels": cif_dir_path / "error_duplicate_labels", + "error_wrong_loop_value": cif_dir_path / "error_wrong_loop_value", + "error_coords": cif_dir_path / "error_coords", + "error_invalid_label": cif_dir_path / "error_invalid_label", + "error_others": cif_dir_path / "error_others", } # Ensure all direct @@ -65,7 +58,7 @@ def move_files_based_on_errors(dir_path, file_paths): else: error_type = "error_others" - make_directory_and_move(file_path, error_directories[error_type], filename) + _make_directory_and_move(file_path, error_directories[error_type], filename) num_files_moved[error_type] += 1 print(f"File {filename} moved to '{error_type}' due to: {error_message}") @@ -74,3 +67,9 @@ def move_files_based_on_errors(dir_path, file_paths): for error_type, count in num_files_moved.items(): print(f"# of files moved to '{error_type}' folder: {count}") print() + +def _make_directory_and_move(file_path, dir_path, new_file_path): + """Create directory if it doesn't exist and move the file.""" + os.makedirs(dir_path, exist_ok=True) + new_file_path = os.path.join(dir_path, new_file_path) + os.rename(file_path, new_file_path) From fb0a4d9ebdb4816223980894bf767a535363d6a3 Mon Sep 17 00:00:00 2001 From: Sangjoon Bob Lee Date: Fri, 24 Jan 2025 10:34:09 -0500 Subject: [PATCH 4/4] chore: apply pre-commit --- news/format-PCD.rst | 2 +- src/cifkit/preprocessors/error.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/news/format-PCD.rst b/news/format-PCD.rst index 3e98435..6b98577 100644 --- a/news/format-PCD.rst +++ b/news/format-PCD.rst @@ -1,6 +1,6 @@ **Added:** -* Relocate PCD .cif files that have no atomic site/label while preprocesing each .cif file. +* Relocate PCD .cif files that have no atomic site/label while preprocessing each .cif file. **Changed:** diff --git a/src/cifkit/preprocessors/error.py b/src/cifkit/preprocessors/error.py index 083cb22..7a8e397 100644 --- a/src/cifkit/preprocessors/error.py +++ b/src/cifkit/preprocessors/error.py @@ -68,6 +68,7 @@ def move_files_based_on_errors(cif_dir_path, file_paths): print(f"# of files moved to '{error_type}' folder: {count}") print() + def _make_directory_and_move(file_path, dir_path, new_file_path): """Create directory if it doesn't exist and move the file.""" os.makedirs(dir_path, exist_ok=True)