Skip to content

Commit

Permalink
Merge pull request #75 from bobleesj/format-unpack
Browse files Browse the repository at this point in the history
Relocate PCD .cif files that have no atomic site/label while preprocessing each .cif file
  • Loading branch information
bobleesj authored Jan 24, 2025
2 parents e5fd787 + fb0a4d9 commit 1c11a56
Show file tree
Hide file tree
Showing 7 changed files with 179 additions and 25 deletions.
23 changes: 23 additions & 0 deletions news/format-PCD.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
**Added:**

* Relocate PCD .cif files that have no atomic site/label while preprocessing each .cif file.

**Changed:**

* <news item>

**Deprecated:**

* <news item>

**Removed:**

* <news item>

**Fixed:**

* <news item>

**Security:**

* <news item>
2 changes: 1 addition & 1 deletion src/cifkit/models/cif_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def __init__(
self._log_info(CifEnsembleLog.PREPROCESSING.value)
for file_path in file_paths:
edit_cif_file_based_on_db(file_path)
# Move ill-formatted files after processing
# Move ill-formatted files after pre-processing
move_files_based_on_errors(cif_dir_path, file_paths)

# Initialize new files after ill-formatted files are moved
Expand Down
50 changes: 30 additions & 20 deletions src/cifkit/preprocessors/error.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,25 @@
from pathlib import Path

from cifkit.models.cif import Cif
from cifkit.preprocessors.format import preprocess_label_element_loop_values
from cifkit.utils.cif_parser import check_unique_atom_site_labels


def make_directory_and_move(file_path, dir_path, new_file_path):
"""Create directory if it doesn't exist and move the file."""
os.makedirs(dir_path, exist_ok=True)
new_file_path = os.path.join(dir_path, new_file_path)
os.rename(file_path, new_file_path)


def move_files_based_on_errors(dir_path, file_paths):
print(f"\nCIF Preprocessing in {dir_path} begun...\n")
def move_files_based_on_errors(cif_dir_path, file_paths):
print(f"\nCIF Preprocessing in {cif_dir_path} begun...\n")

# Ensure dir_path is a Path object
dir_path = Path(dir_path)
cif_dir_path = Path(cif_dir_path)

# Dictionary to hold directory paths for each error type
error_directories = {
"error_operations": dir_path / "error_operations",
"error_duplicate_labels": dir_path / "error_duplicate_labels",
"error_wrong_loop_value": dir_path / "error_wrong_loop_value",
"error_coords": dir_path / "error_coords",
"error_invalid_label": dir_path / "error_invalid_label",
"error_others": dir_path / "error_others",
"error_no_labels": cif_dir_path / "error_no_labels",
"error_operations": cif_dir_path / "error_operations",
"error_duplicate_labels": cif_dir_path / "error_duplicate_labels",
"error_wrong_loop_value": cif_dir_path / "error_wrong_loop_value",
"error_coords": cif_dir_path / "error_coords",
"error_invalid_label": cif_dir_path / "error_invalid_label",
"error_others": cif_dir_path / "error_others",
}

# Ensure all direct
Expand All @@ -35,15 +30,23 @@ def move_files_based_on_errors(dir_path, file_paths):
filename = os.path.basename(file_path)
print(f"Preprocessing {file_path} ({i}/{len(file_paths)})")
try:
# Check the label before instantiating the Cif object to save time
# Attempt to initialize a Cif object and if it is PCD source,
# preprocess the CIF file and identify the error type
cif = Cif(file_path, is_formatted=True)
db_source = cif.db_source
if db_source == "PCD":
# Preprocess the CIF file
preprocess_label_element_loop_values(file_path)
# Check site element can be parsed from site label
check_unique_atom_site_labels(file_path)
# Instantiate the Cif object fully
Cif(file_path, is_formatted=True)

except Exception as e:
error_message = str(e)
# Example of handling specific errors, adjust as needed
if "symmetry operation" in error_message:
error_type = "error_operations"
elif "no atomic label and type" in error_message:
error_type = "error_no_labels"
elif "contains duplicate atom site labels" in error_message:
error_type = "error_duplicate_labels"
elif "Wrong number of values in loop" in error_message:
Expand All @@ -55,7 +58,7 @@ def move_files_based_on_errors(dir_path, file_paths):
else:
error_type = "error_others"

make_directory_and_move(file_path, error_directories[error_type], filename)
_make_directory_and_move(file_path, error_directories[error_type], filename)
num_files_moved[error_type] += 1
print(f"File {filename} moved to '{error_type}' due to: {error_message}")

Expand All @@ -64,3 +67,10 @@ def move_files_based_on_errors(dir_path, file_paths):
for error_type, count in num_files_moved.items():
print(f"# of files moved to '{error_type}' folder: {count}")
print()


def _make_directory_and_move(file_path, dir_path, new_file_path):
"""Create directory if it doesn't exist and move the file."""
os.makedirs(dir_path, exist_ok=True)
new_file_path = os.path.join(dir_path, new_file_path)
os.rename(file_path, new_file_path)
8 changes: 5 additions & 3 deletions src/cifkit/preprocessors/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@ def preprocess_label_element_loop_values(file_path: str) -> None:
content_lines = cif_parser.get_line_content_from_tag(
file_path, "_atom_site_occupancy"
)

# Check whether the content line is empty, then throw an error
for line in content_lines:
line = line.strip()
site_label, atom_type_symbol = line.split()[:2]
try:
site_label, atom_type_symbol = line.split()[:2]
except ValueError:
raise ValueError("The file contains no atomic label and type.")
atom_type_from_label = string_parser.get_atom_type_from_label(site_label)

unique_elements = cif_parser.get_unique_elements_from_loop(loop_values)
"""Type 8. Ex) 1817279.cif.
Expand Down
1 change: 1 addition & 0 deletions src/cifkit/utils/cif_editor.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def edit_cif_file_based_on_db(file_path: str):
add_hashtag_in_first_line(file_path)
elif db_source == "PCD":
remove_author_loop(file_path)
# Preprocessing the label is only tested on PCD files
preprocess_label_element_loop_values(file_path)

check_unique_atom_site_labels(file_path)
2 changes: 1 addition & 1 deletion tests/core/models/test_cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ def test_init_error_coord_missing():
with pytest.raises(ValueError) as e:
Cif(file_path)

assert "not enough values to unpack (expected 2, got 1)" in str(e.value)
assert "contains no atomic label and type." in str(e.value)


"""
Expand Down
118 changes: 118 additions & 0 deletions tests/data/cif/error/combined/260486.cif
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
##############################################################################
# #
# Au-In # Au9In4 ht # 260486 #
# #
##############################################################################
# #
# Pearson's Crystal Data #
# Crystal Structure Database for Inorganic Compounds (on DVD) #
# Release 2024/25 #
# Editors: Pierre Villars, Karin Cenzual, and Vitaliy Dubenskyy #
# #
# Copyright (c) ASM International & Material Phases Data System (MPDS), #
# Switzerland & National Institute for Materials Science (NIMS), Japan, 2024 #
# All rights reserved. Version 2024.07 #
# #
# This copy of Pearson's Crystal Data is licensed to: #
# Hunter College - City University of New York #
# #
##############################################################################

data_260486
_audit_creation_date 2024-09-17
_audit_creation_method
;
Pearson's Crystal Data browser
;
#_database_code_PCD 260486
_database_code_PDF ?

# Entry summary

_chemical_formula_structural 'Au~9~ In~4~'
_chemical_formula_sum 'Au9 In4'
_chemical_name_mineral ?
_chemical_compound_source ?
_chemical_name_structure_type Au~6~(Au~0.5~In~0.5~)~6~In,cP76,215
_chemical_formula_weight 2232.0

# Bibliographic data

_publ_section_title
'The gold-indium thin film system: A high resolution electron microscopy study'
_journal_coden_ASTM JCOMAH
_journal_name_full 'J. Less-Common Met.'
_journal_year 1986
_journal_volume 116
_journal_page_first 63
_journal_page_last 72
_journal_language English
loop_
_publ_author_name
_publ_author_address
''
;
;

# Standardized crystallographic data

_cell_length_a 9.84
_cell_length_b 9.84
_cell_length_c 9.84
_cell_angle_alpha 90
_cell_angle_beta 90
_cell_angle_gamma 90
_cell_volume 952.76
_cell_formula_units_Z 4
_space_group_IT_number 215
_space_group_name_H-M_alt 'P -4 3 m'
loop_
_space_group_symop_id
_space_group_symop_operation_xyz
1 'x, y, z'
2 '-x, -y, z'
3 '-x, -z, y'
4 '-x, y, -z'
5 '-x, z, -y'
6 '-y, -x, z'
7 '-y, -z, x'
8 '-y, x, -z'
9 '-y, z, -x'
10 '-z, -x, y'
11 '-z, -y, x'
12 '-z, x, -y'
13 '-z, y, -x'
14 'x, -y, -z'
15 'x, -z, -y'
16 'x, z, y'
17 'y, -x, -z'
18 'y, -z, -x'
19 'y, x, z'
20 'y, z, x'
21 'z, -x, -y'
22 'z, -y, -x'
23 'z, x, y'
24 'z, y, x'


_exptl_crystal_colour ?
_exptl_crystal_density_meas ?
_exptl_crystal_density_diffrn 15.56
_cell_measurement_temperature ?
_cell_measurement_radiation electrons
_cell_measurement_reflns_used ?
_diffrn_ambient_temperature ?
_diffrn_measurement_device ?
_diffrn_measurement_device_type ?
_diffrn_radiation_type ?
_diffrn_reflns_number ?
_exptl_absorpt_coefficient_mu ?
_exptl_absorpt_correction_type ?
_computing_structure_solution ?
_refine_ls_number_parameters ?
_refine_ls_number_reflns ?
_refine_ls_R_factor_gt ?
_refine_ls_wR_factor_gt ?

# End of data set 260486

0 comments on commit 1c11a56

Please sign in to comment.