Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Relocate PCD .cif files that have no atomic site/label while preprocessing each .cif file #75

Merged
merged 4 commits into from
Jan 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions news/format-PCD.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
**Added:**

* Relocate PCD .cif files that have no atomic site/label while preprocessing each .cif file.

**Changed:**

* <news item>

**Deprecated:**

* <news item>

**Removed:**

* <news item>

**Fixed:**

* <news item>

**Security:**

* <news item>
2 changes: 1 addition & 1 deletion src/cifkit/models/cif_ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def __init__(
self._log_info(CifEnsembleLog.PREPROCESSING.value)
for file_path in file_paths:
edit_cif_file_based_on_db(file_path)
# Move ill-formatted files after processing
# Move ill-formatted files after pre-processing
move_files_based_on_errors(cif_dir_path, file_paths)

# Initialize new files after ill-formatted files are moved
Expand Down
50 changes: 30 additions & 20 deletions src/cifkit/preprocessors/error.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,25 @@
from pathlib import Path

from cifkit.models.cif import Cif
from cifkit.preprocessors.format import preprocess_label_element_loop_values
from cifkit.utils.cif_parser import check_unique_atom_site_labels


def make_directory_and_move(file_path, dir_path, new_file_path):
    """Move a file into a directory, creating the directory if needed.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path of the existing file to move.
    dir_path : str or os.PathLike
        Destination directory; created (with parents) if it does not exist.
    new_file_path : str or os.PathLike
        Path of the moved file relative to ``dir_path``.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be moved.
    """
    os.makedirs(dir_path, exist_ok=True)
    destination = os.path.join(dir_path, new_file_path)
    # os.replace overwrites an existing destination on every platform,
    # whereas os.rename raises FileExistsError on Windows in that case.
    os.replace(file_path, destination)


def move_files_based_on_errors(dir_path, file_paths):
print(f"\nCIF Preprocessing in {dir_path} begun...\n")
def move_files_based_on_errors(cif_dir_path, file_paths):
print(f"\nCIF Preprocessing in {cif_dir_path} begun...\n")

# Ensure cif_dir_path is a Path object
dir_path = Path(dir_path)
cif_dir_path = Path(cif_dir_path)

# Dictionary to hold directory paths for each error type
error_directories = {
"error_operations": dir_path / "error_operations",
"error_duplicate_labels": dir_path / "error_duplicate_labels",
"error_wrong_loop_value": dir_path / "error_wrong_loop_value",
"error_coords": dir_path / "error_coords",
"error_invalid_label": dir_path / "error_invalid_label",
"error_others": dir_path / "error_others",
"error_no_labels": cif_dir_path / "error_no_labels",
"error_operations": cif_dir_path / "error_operations",
"error_duplicate_labels": cif_dir_path / "error_duplicate_labels",
"error_wrong_loop_value": cif_dir_path / "error_wrong_loop_value",
"error_coords": cif_dir_path / "error_coords",
"error_invalid_label": cif_dir_path / "error_invalid_label",
"error_others": cif_dir_path / "error_others",
}

# Ensure all direct
Expand All @@ -35,15 +30,23 @@ def move_files_based_on_errors(dir_path, file_paths):
filename = os.path.basename(file_path)
print(f"Preprocessing {file_path} ({i}/{len(file_paths)})")
try:
# Check the label before instantiating the Cif object to save time
# Attempt to initialize a Cif object and if it is PCD source,
# preprocess the CIF file and identify the error type
cif = Cif(file_path, is_formatted=True)
db_source = cif.db_source
if db_source == "PCD":
# Preprocess the CIF file
preprocess_label_element_loop_values(file_path)
# Check site element can be parsed from site label
check_unique_atom_site_labels(file_path)
# Instantiate the Cif object fully
Cif(file_path, is_formatted=True)

except Exception as e:
error_message = str(e)
# Example of handling specific errors, adjust as needed
if "symmetry operation" in error_message:
error_type = "error_operations"
elif "no atomic label and type" in error_message:
error_type = "error_no_labels"
elif "contains duplicate atom site labels" in error_message:
error_type = "error_duplicate_labels"
elif "Wrong number of values in loop" in error_message:
Expand All @@ -55,7 +58,7 @@ def move_files_based_on_errors(dir_path, file_paths):
else:
error_type = "error_others"

make_directory_and_move(file_path, error_directories[error_type], filename)
_make_directory_and_move(file_path, error_directories[error_type], filename)
num_files_moved[error_type] += 1
print(f"File {filename} moved to '{error_type}' due to: {error_message}")

Expand All @@ -64,3 +67,10 @@ def move_files_based_on_errors(dir_path, file_paths):
for error_type, count in num_files_moved.items():
print(f"# of files moved to '{error_type}' folder: {count}")
print()


def _make_directory_and_move(file_path, dir_path, new_file_path):
    """Move a file into a directory, creating the directory if needed.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path of the existing file to move.
    dir_path : str or os.PathLike
        Destination directory; created (with parents) if it does not exist.
    new_file_path : str or os.PathLike
        Path of the moved file relative to ``dir_path``.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be moved.
    """
    os.makedirs(dir_path, exist_ok=True)
    destination = os.path.join(dir_path, new_file_path)
    # os.replace overwrites an existing destination on every platform,
    # whereas os.rename raises FileExistsError on Windows in that case.
    os.replace(file_path, destination)
8 changes: 5 additions & 3 deletions src/cifkit/preprocessors/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@ def preprocess_label_element_loop_values(file_path: str) -> None:
content_lines = cif_parser.get_line_content_from_tag(
file_path, "_atom_site_occupancy"
)

# Raise an error if a content line lacks an atomic site label and type
for line in content_lines:
line = line.strip()
site_label, atom_type_symbol = line.split()[:2]
try:
site_label, atom_type_symbol = line.split()[:2]
except ValueError:
raise ValueError("The file contains no atomic label and type.")
atom_type_from_label = string_parser.get_atom_type_from_label(site_label)

unique_elements = cif_parser.get_unique_elements_from_loop(loop_values)
"""Type 8. Ex) 1817279.cif.

Expand Down
1 change: 1 addition & 0 deletions src/cifkit/utils/cif_editor.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ def edit_cif_file_based_on_db(file_path: str):
add_hashtag_in_first_line(file_path)
elif db_source == "PCD":
remove_author_loop(file_path)
# Preprocessing the label is only tested on PCD files
preprocess_label_element_loop_values(file_path)

check_unique_atom_site_labels(file_path)
2 changes: 1 addition & 1 deletion tests/core/models/test_cif.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ def test_init_error_coord_missing():
with pytest.raises(ValueError) as e:
Cif(file_path)

assert "not enough values to unpack (expected 2, got 1)" in str(e.value)
assert "contains no atomic label and type." in str(e.value)


"""
Expand Down
118 changes: 118 additions & 0 deletions tests/data/cif/error/combined/260486.cif
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
##############################################################################
# #
# Au-In # Au9In4 ht # 260486 #
# #
##############################################################################
# #
# Pearson's Crystal Data #
# Crystal Structure Database for Inorganic Compounds (on DVD) #
# Release 2024/25 #
# Editors: Pierre Villars, Karin Cenzual, and Vitaliy Dubenskyy #
# #
# Copyright (c) ASM International & Material Phases Data System (MPDS), #
# Switzerland & National Institute for Materials Science (NIMS), Japan, 2024 #
# All rights reserved. Version 2024.07 #
# #
# This copy of Pearson's Crystal Data is licensed to: #
# Hunter College - City University of New York #
# #
##############################################################################

data_260486
_audit_creation_date 2024-09-17
_audit_creation_method
;
Pearson's Crystal Data browser
;
#_database_code_PCD 260486
_database_code_PDF ?

# Entry summary

_chemical_formula_structural 'Au~9~ In~4~'
_chemical_formula_sum 'Au9 In4'
_chemical_name_mineral ?
_chemical_compound_source ?
_chemical_name_structure_type Au~6~(Au~0.5~In~0.5~)~6~In,cP76,215
_chemical_formula_weight 2232.0

# Bibliographic data

_publ_section_title
'The gold-indium thin film system: A high resolution electron microscopy study'
_journal_coden_ASTM JCOMAH
_journal_name_full 'J. Less-Common Met.'
_journal_year 1986
_journal_volume 116
_journal_page_first 63
_journal_page_last 72
_journal_language English
loop_
_publ_author_name
_publ_author_address
''
;
;

# Standardized crystallographic data

_cell_length_a 9.84
_cell_length_b 9.84
_cell_length_c 9.84
_cell_angle_alpha 90
_cell_angle_beta 90
_cell_angle_gamma 90
_cell_volume 952.76
_cell_formula_units_Z 4
_space_group_IT_number 215
_space_group_name_H-M_alt 'P -4 3 m'
loop_
_space_group_symop_id
_space_group_symop_operation_xyz
1 'x, y, z'
2 '-x, -y, z'
3 '-x, -z, y'
4 '-x, y, -z'
5 '-x, z, -y'
6 '-y, -x, z'
7 '-y, -z, x'
8 '-y, x, -z'
9 '-y, z, -x'
10 '-z, -x, y'
11 '-z, -y, x'
12 '-z, x, -y'
13 '-z, y, -x'
14 'x, -y, -z'
15 'x, -z, -y'
16 'x, z, y'
17 'y, -x, -z'
18 'y, -z, -x'
19 'y, x, z'
20 'y, z, x'
21 'z, -x, -y'
22 'z, -y, -x'
23 'z, x, y'
24 'z, y, x'


_exptl_crystal_colour ?
_exptl_crystal_density_meas ?
_exptl_crystal_density_diffrn 15.56
_cell_measurement_temperature ?
_cell_measurement_radiation electrons
_cell_measurement_reflns_used ?
_diffrn_ambient_temperature ?
_diffrn_measurement_device ?
_diffrn_measurement_device_type ?
_diffrn_radiation_type ?
_diffrn_reflns_number ?
_exptl_absorpt_coefficient_mu ?
_exptl_absorpt_correction_type ?
_computing_structure_solution ?
_refine_ls_number_parameters ?
_refine_ls_number_reflns ?
_refine_ls_R_factor_gt ?
_refine_ls_wR_factor_gt ?

# End of data set 260486

Loading