Release commit
Signed-off-by: Stephen Augustus <saugustus2@bloomberg.net>
This commit is contained in:
105
m3docvqa/tests/test_downloader.py
Normal file
105
m3docvqa/tests/test_downloader.py
Normal file
@ -0,0 +1,105 @@
|
||||
# Copyright 2024 Bloomberg Finance L.P.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
from pathlib import Path
|
||||
import jsonlines
|
||||
from m3docvqa.downloader import _download_wiki_page, download_wiki_page
|
||||
|
||||
|
||||
@pytest.fixture
def test_urls_and_paths(tmp_path):
    """Supply two sample Wikipedia URLs and matching PDF save paths.

    Returns:
        A ``(urls, save_paths)`` tuple of two lists of strings; the save
        paths point inside the per-test ``tmp_path`` directory.
    """
    page_urls = [
        "https://en.wikipedia.org/wiki/SamplePage1",
        "https://en.wikipedia.org/wiki/SamplePage2",
    ]
    pdf_targets = [str(tmp_path / file_name) for file_name in ("sample1.pdf", "sample2.pdf")]
    return page_urls, pdf_targets
|
||||
|
||||
|
||||
@patch("m3docvqa.downloader.sync_playwright")
def test__download_wiki_page_pdf(mock_playwright, tmp_path):
    """Verify a single-page PDF download against a mocked Playwright stack.

    The page object is reached through the mock chain
    ``sync_playwright().__enter__().chromium.launch().new_context().new_page()``,
    and the test asserts that it receives ``goto(url)`` followed by
    ``pdf(path=...)``.
    """
    url = "https://en.wikipedia.org/wiki/SamplePage"
    save_path = tmp_path / "sample.pdf"

    # MagicMock creates child mocks on attribute access, so the page handed to
    # the code under test can be obtained by walking the expected call chain.
    playwright_handle = mock_playwright.return_value.__enter__.return_value
    fake_browser = playwright_handle.chromium.launch.return_value
    fake_page = fake_browser.new_context.return_value.new_page.return_value

    # The layout of this argument tuple is defined by `_download_wiki_page`.
    downloaded, error = _download_wiki_page((0, 1, url, str(save_path), 'pdf', 0))

    assert downloaded is True
    assert error is None
    fake_page.goto.assert_called_once_with(url)
    fake_page.pdf.assert_called_once_with(path=str(save_path))
|
||||
|
||||
|
||||
@patch("m3docvqa.downloader.sync_playwright")
def test__download_wiki_page_png(mock_playwright, tmp_path):
    """Verify a single-page PNG download against a mocked Playwright stack.

    The test asserts that the mocked page receives ``goto(url)`` and a
    full-page ``screenshot`` call targeting the requested save path.
    """
    url = "https://en.wikipedia.org/wiki/SamplePage"
    save_path = tmp_path / "sample.png"

    # Walk the auto-created MagicMock chain to reach the page object that the
    # code under test will receive from browser -> context -> page.
    playwright_handle = mock_playwright.return_value.__enter__.return_value
    fake_browser = playwright_handle.chromium.launch.return_value
    fake_page = fake_browser.new_context.return_value.new_page.return_value

    # The layout of this argument tuple is defined by `_download_wiki_page`.
    downloaded, error = _download_wiki_page((0, 1, url, str(save_path), 'png', 0))

    assert downloaded is True
    assert error is None
    fake_page.goto.assert_called_once_with(url)
    fake_page.screenshot.assert_called_once_with(path=str(save_path), full_page=True)
|
||||
|
||||
|
||||
@patch("m3docvqa.downloader._download_wiki_page")
def test_download_wiki_page_batch(mock_download_wiki_page, tmp_path, test_urls_and_paths):
    """Verify batch downloading of two pages with the per-page worker mocked.

    The worker is stubbed to report success twice; the test then checks the
    returned flags and the two entries written to the JSONL result log.
    """
    urls, save_paths = test_urls_and_paths
    result_jsonl_path = tmp_path / "download_results.jsonl"

    # One (downloaded, error) pair per URL: every download succeeds.
    mock_download_wiki_page.side_effect = [(True, None) for _ in range(2)]

    results = download_wiki_page(
        urls,
        save_paths,
        'pdf',
        str(result_jsonl_path),
        proc_id=0,
        n_proc=1,
    )

    assert results == [True, True]
    assert result_jsonl_path.exists()

    # The log must contain exactly one successful entry per URL.
    with jsonlines.open(result_jsonl_path, 'r') as reader:
        log_entries = list(reader)
    assert len(log_entries) == 2
    for entry in log_entries:
        assert entry['downloaded'] is True
        assert entry['error'] is None
|
86
m3docvqa/tests/test_pdf_utils.py
Normal file
86
m3docvqa/tests/test_pdf_utils.py
Normal file
@ -0,0 +1,86 @@
|
||||
# Copyright 2024 Bloomberg Finance L.P.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
from m3docvqa.pdf_utils import is_pdf_downloaded, is_pdf_clean, get_images_from_pdf
|
||||
from pathlib import Path
|
||||
from PIL import Image
|
||||
from reportlab.pdfgen import canvas # For creating sample PDFs
|
||||
|
||||
|
||||
@pytest.fixture
def sample_pdf(tmp_path) -> Path:
    """Write a small PDF containing a single text string and return its path."""
    target = tmp_path / "sample.pdf"
    pdf_canvas = canvas.Canvas(str(target))
    # One line of text is enough to give the PDF some content.
    pdf_canvas.drawString(100, 100, "Sample PDF text for testing.")
    pdf_canvas.save()
    return target
|
||||
|
||||
|
||||
@pytest.fixture
def corrupted_pdf(tmp_path) -> Path:
    """Write a file that has a PDF header but no valid body and return its path."""
    broken_file = tmp_path / "corrupted.pdf"
    # Header followed by junk bytes: deliberately incomplete PDF content.
    payload = b"%PDF-1.4 corrupted content"
    broken_file.write_bytes(payload)
    return broken_file
|
||||
|
||||
|
||||
def test_is_pdf_downloaded_existing_pdf(sample_pdf):
    """An existing, valid PDF must be reported as downloaded."""
    outcome = is_pdf_downloaded(str(sample_pdf))
    assert outcome is True, "Expected PDF to be recognized as downloaded."
|
||||
|
||||
|
||||
def test_is_pdf_downloaded_nonexistent_pdf(tmp_path):
    """A path with no file behind it must be reported as not downloaded."""
    missing_path = str(tmp_path / "non_existent.pdf")
    outcome = is_pdf_downloaded(missing_path)
    assert outcome is False, "Expected non-existent PDF to be marked as not downloaded."
|
||||
|
||||
|
||||
def test_is_pdf_clean_valid_pdf(sample_pdf):
    """A freshly generated, valid PDF must be reported as clean."""
    outcome = is_pdf_clean(str(sample_pdf))
    assert outcome is True, "Expected PDF to be recognized as clean."
|
||||
|
||||
|
||||
def test_is_pdf_clean_corrupted_pdf(corrupted_pdf):
    """A file with truncated/invalid PDF content must be reported as not clean."""
    outcome = is_pdf_clean(str(corrupted_pdf))
    assert outcome is False, "Expected corrupted PDF to be marked as not clean."
|
||||
|
||||
|
||||
def test_get_images_from_pdf_extract_images(sample_pdf, tmp_path):
    """Extract images from the sample PDF into a directory and validate them.

    Checks that at least one image is returned, that the number of ``*.png``
    files on disk matches the number of returned images, and that each saved
    file opens as a PNG.
    """
    output_dir = tmp_path / "images"
    extracted = get_images_from_pdf(
        str(sample_pdf),
        save_dir=str(output_dir),
        dpi_resolution=72,
        save_type='png',
    )

    # Something must have been produced from the PDF.
    assert len(extracted) > 0, "Expected at least one image to be extracted from the PDF."

    # Every returned image should have a counterpart on disk.
    png_files = list(output_dir.glob("*.png"))
    assert len(png_files) == len(extracted), "Expected number of saved images to match the number of extracted images."

    # Each saved file must be readable and actually encoded as PNG.
    for png_file in png_files:
        with Image.open(png_file) as opened:
            assert opened.format == "PNG", "Expected saved image to be in PNG format."
|
||||
|
||||
|
||||
def test_get_images_from_pdf_no_save_dir(sample_pdf):
    """With ``save_dir=None`` the images are only returned, as PIL images."""
    returned = get_images_from_pdf(str(sample_pdf), save_dir=None, dpi_resolution=72)

    assert len(returned) > 0, "Expected at least one image to be returned without saving."

    every_item_is_image = all(isinstance(item, Image.Image) for item in returned)
    assert every_item_is_image, "Expected all returned items to be PIL Image objects."
|
||||
|
107
m3docvqa/tests/test_split_utils.py
Normal file
107
m3docvqa/tests/test_split_utils.py
Normal file
@ -0,0 +1,107 @@
|
||||
# Copyright 2024 Bloomberg Finance L.P.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
import json
|
||||
import jsonlines
|
||||
from unittest.mock import MagicMock, patch
|
||||
from m3docvqa.split_utils import create_split_dirs
|
||||
|
||||
|
||||
@pytest.fixture
def mock_pdf_directory(tmp_path):
    """Create ``tmp_path/pdfs`` holding two placeholder files, doc1.pdf and doc2.pdf."""
    source_dir = tmp_path / "pdfs"
    source_dir.mkdir()
    # The files only need to exist; their content is plain placeholder text.
    for doc_id in ("doc1", "doc2"):
        (source_dir / f"{doc_id}.pdf").write_text(f"PDF content for {doc_id}")
    return source_dir
|
||||
|
||||
|
||||
@pytest.fixture
def mock_metadata_file(tmp_path):
    """Write a two-record JSONL metadata file (MMQA_train.jsonl) and return its path.

    Each record carries a ``supporting_context`` list with one ``doc_id``
    entry, referencing ``doc1`` and ``doc2`` respectively.
    """
    jsonl_path = tmp_path / "MMQA_train.jsonl"
    records = [
        {"supporting_context": [{"doc_id": doc_id}]}
        for doc_id in ("doc1", "doc2")
    ]
    with jsonlines.open(jsonl_path, mode='w') as writer:
        writer.write_all(records)
    return jsonl_path
|
||||
|
||||
|
||||
@pytest.fixture
def mock_target_directory(tmp_path):
    """Return the path ``tmp_path/target``; the directory itself is not created here."""
    target_base = tmp_path / "target"
    return target_base
|
||||
|
||||
|
||||
def test_create_split_dirs(mock_pdf_directory, mock_metadata_file, mock_target_directory):
    """create_split_dirs must create ``pdfs_<split>`` and place both PDFs in it."""
    split = "train"

    create_split_dirs(
        all_pdf_dir=mock_pdf_directory,
        target_dir_base=mock_target_directory,
        split_metadata_file=mock_metadata_file,
        split=split,
    )

    # The split directory is expected directly under the target base.
    split_dir = mock_target_directory / f"pdfs_{split}"
    assert split_dir.exists(), f"Directory {split_dir} was not created"

    # Both documents referenced by the metadata must be present.
    for pdf_name in ("doc1.pdf", "doc2.pdf"):
        assert (split_dir / pdf_name).exists(), f"{pdf_name} was not copied"
|
||||
|
||||
|
||||
def test_create_split_dirs_missing_pdf(mock_metadata_file, mock_target_directory, tmp_path):
    """create_split_dirs must still create the split directory when source PDFs are missing.

    The source PDF directory is a path under ``tmp_path`` that is never
    created. Anchoring it there (rather than using a path relative to the
    current working directory) keeps the test hermetic: it cannot be affected
    by a stray ``non_existing_pdf_dir`` in whatever directory pytest is
    launched from.
    """
    split = "train"
    all_pdf_dir = tmp_path / "non_existing_pdf_dir"
    # Guard the precondition so a fixture change cannot silently invalidate the test.
    assert not all_pdf_dir.exists(), "source PDF directory must not exist for this test"

    # Call the function and verify that the missing PDFs are handled correctly.
    create_split_dirs(
        all_pdf_dir=all_pdf_dir,
        target_dir_base=mock_target_directory,
        split_metadata_file=mock_metadata_file,
        split=split,
    )

    target_dir = mock_target_directory / f"pdfs_{split}"
    assert target_dir.exists(), f"Directory {target_dir} was not created"
    # Nothing could be copied, so neither referenced document may appear.
    assert not (target_dir / "doc1.pdf").exists(), "doc1.pdf should not exist"
    assert not (target_dir / "doc2.pdf").exists(), "doc2.pdf should not exist"
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "split, expected_error",
    [
        ("test_split", ValueError),  # Invalid split type
        (None, ValueError),  # Missing split
    ],
)
def test_create_split_dirs_invalid_split_type(mock_pdf_directory, mock_metadata_file, mock_target_directory, split, expected_error):
    """create_split_dirs must raise the expected error for unsupported split values."""
    call_kwargs = {
        "all_pdf_dir": mock_pdf_directory,
        "target_dir_base": mock_target_directory,
        "split_metadata_file": mock_metadata_file,
        "split": split,
    }
    with pytest.raises(expected_error):
        create_split_dirs(**call_kwargs)
|
Reference in New Issue
Block a user