Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use or not use junk tags #1502

Merged
merged 2 commits into from
Feb 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion libsys_airflow/dags/data_exports/full_dump_retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@
type="integer",
description="Number of batch processing jobs to run in parallel.",
),
"exclude_tags": Param(
True,
type="boolean",
description="Remove excluded tags listed in marc/excluded_tags.py from incoming record.",
),
},
) as dag:

Expand Down Expand Up @@ -131,8 +136,13 @@ def transform_marc_records_add_holdings(marc_files: list):

@task
def transform_marc_records_clean_serialize(marc_files: list):
context = get_current_context()
params = context.get("params", {}) # type: ignore
exclude_tags = params.get("exclude_tags", True)
for marc_file in marc_files:
marc_clean_serialize(marc_file, full_dump=True)
marc_clean_serialize(
marc_file, full_dump=True, exclude_tags=exclude_tags
)

@task
def compress_marc_files(marc_files: list):
Expand Down
24 changes: 12 additions & 12 deletions libsys_airflow/plugins/data_exports/marc/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,13 +83,13 @@ def leader_for_deletes(marc_file: str, full_dump: bool):
def clean_and_serialize_marc_files(marc_file_list: dict):
for kind, file_list in marc_file_list.items():
for filepath in file_list:
marc_clean_serialize(filepath, False)
marc_clean_serialize(filepath, False, True)
logger.info(
f"Removed MARC fields and serialized records for '{kind}' files: {filepath}"
)


def marc_clean_serialize(marc_file: str, full_dump: bool):
def marc_clean_serialize(marc_file: str, full_dump: bool, exclude_tags: bool):
"""
Removes MARC fields from export MARC21 file
"""
Expand All @@ -101,16 +101,16 @@ def marc_clean_serialize(marc_file: str, full_dump: bool):
with marc_path.open('rb') as fo:
marc_records = [record for record in pymarc.MARCReader(fo)]

logger.info(f"Removing MARC fields for {len(marc_records):,} records")

for i, record in enumerate(marc_records):
try:
record.remove_fields(*excluded_tags)
if not i % 100:
logger.info(f"{i:,} records processed")
except AttributeError as e:
logger.warning(e)
continue
if exclude_tags:
logger.info(f"Removing MARC fields for {len(marc_records):,} records")
for i, record in enumerate(marc_records):
try:
record.remove_fields(*excluded_tags)
if not i % 100:
logger.info(f"{i:,} records processed")
except AttributeError as e:
logger.warning(e)
continue

"""
Writes the records back to the filesystem
Expand Down
34 changes: 31 additions & 3 deletions tests/data_exports/test_marc_transformations.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,8 +439,7 @@ def test_clean_and_serialize_marc_files(mock_marc_dir, caplog):
assert f"Removed MARC fields and serialized records for updates files: {str(marc_file)}"


@pytest.mark.parametrize("mock_marc_dir", ["vendor"], indirect=True)
def test_marc_clean_serialize(mock_marc_dir):
def setup_marc_file_for_clean_serialize(mock_marc_dir):
record = pymarc.Record()
record.add_field(
pymarc.Field(
Expand Down Expand Up @@ -471,7 +470,15 @@ def test_marc_clean_serialize(mock_marc_dir):
marc_writer = pymarc.MARCWriter(fo)
marc_writer.write(record)

marc_clean_serialize(str(marc_file.absolute()), full_dump=False)
return marc_file


@pytest.mark.parametrize("mock_marc_dir", ["vendor"], indirect=True)
def test_marc_clean_serialize(mock_marc_dir):

marc_file = setup_marc_file_for_clean_serialize(mock_marc_dir)

marc_clean_serialize(str(marc_file.absolute()), full_dump=False, exclude_tags=True)

with marc_file.open('rb') as fo:
marc_reader = pymarc.MARCReader(fo)
Expand All @@ -489,6 +496,27 @@ def test_marc_clean_serialize(mock_marc_dir):
assert pathlib.Path(xml_file).stat().st_size > 0


@pytest.mark.parametrize("mock_marc_dir", ["vendor"], indirect=True)
def test_marc_no_clean_serialize(mock_marc_dir):
    """
    Runs marc_clean_serialize with exclude_tags=False and checks that the
    record read back still carries 598 and 699 fields and that
    20240228.xml in the fixture directory is non-empty.
    """
    source_path = setup_marc_file_for_clean_serialize(mock_marc_dir)

    marc_clean_serialize(
        str(source_path.absolute()), full_dump=False, exclude_tags=False
    )

    with source_path.open('rb') as handle:
        first_record = next(pymarc.MARCReader(handle))

    remaining_tags = {field.tag for field in first_record.fields}

    # Neither tag may be stripped when tag exclusion is switched off.
    assert "598" in remaining_tags
    assert "699" in remaining_tags

    serialized_xml = mock_marc_dir / "20240228.xml"
    assert pathlib.Path(serialized_xml).stat().st_size > 0


@pytest.mark.parametrize("mock_marc_dir", ["vendor"], indirect=True)
def test_change_leader(mock_marc_dir):
marc_file = mock_marc_dir / "20240509.mrc"
Expand Down