Update M3DocVQA download (#7)
This commit is contained in:
@ -1,107 +0,0 @@
|
||||
# Copyright 2024 Bloomberg Finance L.P.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
import json
|
||||
import jsonlines
|
||||
from unittest.mock import MagicMock, patch
|
||||
from m3docvqa.split_utils import create_split_dirs
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_pdf_directory(tmp_path):
|
||||
# Create a temporary directory for PDFs
|
||||
pdf_dir = tmp_path / "pdfs"
|
||||
pdf_dir.mkdir()
|
||||
# Add some mock PDF files
|
||||
(pdf_dir / "doc1.pdf").write_text("PDF content for doc1")
|
||||
(pdf_dir / "doc2.pdf").write_text("PDF content for doc2")
|
||||
return pdf_dir
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_metadata_file(tmp_path):
|
||||
# Create a temporary metadata file in JSONL format
|
||||
metadata_file = tmp_path / "MMQA_train.jsonl"
|
||||
data = [
|
||||
{"supporting_context": [{"doc_id": "doc1"}]},
|
||||
{"supporting_context": [{"doc_id": "doc2"}]}
|
||||
]
|
||||
with jsonlines.open(metadata_file, mode='w') as writer:
|
||||
writer.write_all(data)
|
||||
return metadata_file
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_target_directory(tmp_path):
|
||||
return tmp_path / "target"
|
||||
|
||||
|
||||
def test_create_split_dirs(mock_pdf_directory, mock_metadata_file, mock_target_directory):
|
||||
"""Test the create_split_dirs function."""
|
||||
# Prepare the split directory
|
||||
split = "train"
|
||||
|
||||
# Call the function to create split directories
|
||||
create_split_dirs(
|
||||
all_pdf_dir=mock_pdf_directory,
|
||||
target_dir_base=mock_target_directory,
|
||||
split_metadata_file=mock_metadata_file,
|
||||
split=split
|
||||
)
|
||||
|
||||
# Assert that the target directory exists and contains the expected PDF files
|
||||
target_dir = mock_target_directory / f"pdfs_{split}"
|
||||
assert target_dir.exists(), f"Directory {target_dir} was not created"
|
||||
assert (target_dir / "doc1.pdf").exists(), "doc1.pdf was not copied"
|
||||
assert (target_dir / "doc2.pdf").exists(), "doc2.pdf was not copied"
|
||||
|
||||
|
||||
def test_create_split_dirs_missing_pdf(mock_metadata_file, mock_target_directory):
|
||||
"""Test create_split_dirs when PDF files are missing."""
|
||||
# Prepare the split directory
|
||||
split = "train"
|
||||
all_pdf_dir = Path("non_existing_pdf_dir")
|
||||
|
||||
# Call the function and verify that the missing PDFs are handled correctly
|
||||
create_split_dirs(
|
||||
all_pdf_dir=all_pdf_dir,
|
||||
target_dir_base=mock_target_directory,
|
||||
split_metadata_file=mock_metadata_file,
|
||||
split=split
|
||||
)
|
||||
|
||||
target_dir = mock_target_directory / f"pdfs_{split}"
|
||||
assert target_dir.exists(), f"Directory {target_dir} was not created"
|
||||
assert not (target_dir / "doc1.pdf").exists(), "doc1.pdf should not exist"
|
||||
assert not (target_dir / "doc2.pdf").exists(), "doc2.pdf should not exist"
|
||||
|
||||
|
||||
@pytest.mark.parametrize("split, expected_error", [
|
||||
("test_split", ValueError), # Invalid split type
|
||||
(None, ValueError), # Missing split
|
||||
])
|
||||
def test_create_split_dirs_invalid_split_type(mock_pdf_directory, mock_metadata_file, mock_target_directory, split, expected_error):
|
||||
"""Test invalid split types in create_split_dirs."""
|
||||
with pytest.raises(expected_error):
|
||||
create_split_dirs(
|
||||
all_pdf_dir=mock_pdf_directory,
|
||||
target_dir_base=mock_target_directory,
|
||||
split_metadata_file=mock_metadata_file,
|
||||
split=split
|
||||
)
|
Reference in New Issue
Block a user