Release commit

Signed-off-by: Stephen Augustus <saugustus2@bloomberg.net>
This commit is contained in:
j-min
2025-01-30 17:04:56 -05:00
committed by oir
parent e04aeadfb0
commit 27aac8d521
50 changed files with 5692 additions and 0 deletions

View File

@ -0,0 +1,107 @@
# Copyright 2024 Bloomberg Finance L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
import pytest
from pathlib import Path
import shutil
import json
import jsonlines
from unittest.mock import MagicMock, patch
from m3docvqa.split_utils import create_split_dirs
@pytest.fixture
def mock_pdf_directory(tmp_path):
# Create a temporary directory for PDFs
pdf_dir = tmp_path / "pdfs"
pdf_dir.mkdir()
# Add some mock PDF files
(pdf_dir / "doc1.pdf").write_text("PDF content for doc1")
(pdf_dir / "doc2.pdf").write_text("PDF content for doc2")
return pdf_dir
@pytest.fixture
def mock_metadata_file(tmp_path):
# Create a temporary metadata file in JSONL format
metadata_file = tmp_path / "MMQA_train.jsonl"
data = [
{"supporting_context": [{"doc_id": "doc1"}]},
{"supporting_context": [{"doc_id": "doc2"}]}
]
with jsonlines.open(metadata_file, mode='w') as writer:
writer.write_all(data)
return metadata_file
@pytest.fixture
def mock_target_directory(tmp_path):
return tmp_path / "target"
def test_create_split_dirs(mock_pdf_directory, mock_metadata_file, mock_target_directory):
"""Test the create_split_dirs function."""
# Prepare the split directory
split = "train"
# Call the function to create split directories
create_split_dirs(
all_pdf_dir=mock_pdf_directory,
target_dir_base=mock_target_directory,
split_metadata_file=mock_metadata_file,
split=split
)
# Assert that the target directory exists and contains the expected PDF files
target_dir = mock_target_directory / f"pdfs_{split}"
assert target_dir.exists(), f"Directory {target_dir} was not created"
assert (target_dir / "doc1.pdf").exists(), "doc1.pdf was not copied"
assert (target_dir / "doc2.pdf").exists(), "doc2.pdf was not copied"
def test_create_split_dirs_missing_pdf(mock_metadata_file, mock_target_directory):
"""Test create_split_dirs when PDF files are missing."""
# Prepare the split directory
split = "train"
all_pdf_dir = Path("non_existing_pdf_dir")
# Call the function and verify that the missing PDFs are handled correctly
create_split_dirs(
all_pdf_dir=all_pdf_dir,
target_dir_base=mock_target_directory,
split_metadata_file=mock_metadata_file,
split=split
)
target_dir = mock_target_directory / f"pdfs_{split}"
assert target_dir.exists(), f"Directory {target_dir} was not created"
assert not (target_dir / "doc1.pdf").exists(), "doc1.pdf should not exist"
assert not (target_dir / "doc2.pdf").exists(), "doc2.pdf should not exist"
@pytest.mark.parametrize("split, expected_error", [
("test_split", ValueError), # Invalid split type
(None, ValueError), # Missing split
])
def test_create_split_dirs_invalid_split_type(mock_pdf_directory, mock_metadata_file, mock_target_directory, split, expected_error):
"""Test invalid split types in create_split_dirs."""
with pytest.raises(expected_error):
create_split_dirs(
all_pdf_dir=mock_pdf_directory,
target_dir_base=mock_target_directory,
split_metadata_file=mock_metadata_file,
split=split
)