Release commit

Signed-off-by: Stephen Augustus <saugustus2@bloomberg.net>
This commit is contained in:
j-min
2025-01-30 17:04:56 -05:00
committed by oir
parent e04aeadfb0
commit 27aac8d521
50 changed files with 5692 additions and 0 deletions

View File

@@ -0,0 +1,105 @@
# Copyright 2024 Bloomberg Finance L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
import pytest
from unittest.mock import patch, MagicMock
from pathlib import Path
import jsonlines
from m3docvqa.downloader import _download_wiki_page, download_wiki_page
@pytest.fixture
def test_urls_and_paths(tmp_path):
    """Supply two sample Wikipedia URLs and matching PDF save paths.

    Returns:
        A ``(urls, save_paths)`` tuple. Both members are two-element lists of
        strings; the save paths live directly under ``tmp_path``.
    """
    urls = [
        "https://en.wikipedia.org/wiki/SamplePage1",
        "https://en.wikipedia.org/wiki/SamplePage2",
    ]
    save_paths = [
        str(tmp_path / filename) for filename in ("sample1.pdf", "sample2.pdf")
    ]
    return urls, save_paths
@patch("m3docvqa.downloader.sync_playwright")
def test__download_wiki_page_pdf(mock_playwright, tmp_path):
    """Verify ``_download_wiki_page`` succeeds for 'pdf' and calls ``page.pdf``."""
    url = "https://en.wikipedia.org/wiki/SamplePage"
    save_path = str(tmp_path / "sample.pdf")

    # Build the mocked Playwright chain from the innermost object outwards:
    # sync_playwright() context manager -> chromium.launch() -> new_context()
    # -> new_page().
    mock_page = MagicMock()
    mock_context = MagicMock()
    mock_context.new_page.return_value = mock_page
    mock_browser = MagicMock()
    mock_browser.new_context.return_value = mock_context
    playwright_handle = mock_playwright.return_value.__enter__.return_value
    playwright_handle.chromium.launch.return_value = mock_browser

    # The worker takes all of its arguments packed into a single tuple.
    downloaded, error = _download_wiki_page((0, 1, url, save_path, 'pdf', 0))

    # Successful run: flag is True, no error, page visited once and saved as PDF.
    assert downloaded is True
    assert error is None
    mock_page.goto.assert_called_once_with(url)
    mock_page.pdf.assert_called_once_with(path=save_path)
@patch("m3docvqa.downloader.sync_playwright")
def test__download_wiki_page_png(mock_playwright, tmp_path):
    """Verify ``_download_wiki_page`` succeeds for 'png' and takes a full-page screenshot."""
    url = "https://en.wikipedia.org/wiki/SamplePage"
    save_path = str(tmp_path / "sample.png")

    # Build the mocked Playwright chain from the innermost object outwards:
    # sync_playwright() context manager -> chromium.launch() -> new_context()
    # -> new_page().
    mock_page = MagicMock()
    mock_context = MagicMock()
    mock_context.new_page.return_value = mock_page
    mock_browser = MagicMock()
    mock_browser.new_context.return_value = mock_context
    playwright_handle = mock_playwright.return_value.__enter__.return_value
    playwright_handle.chromium.launch.return_value = mock_browser

    # The worker takes all of its arguments packed into a single tuple.
    downloaded, error = _download_wiki_page((0, 1, url, save_path, 'png', 0))

    # Successful run: flag is True, no error, page visited once and captured
    # with a full-page screenshot.
    assert downloaded is True
    assert error is None
    mock_page.goto.assert_called_once_with(url)
    mock_page.screenshot.assert_called_once_with(path=save_path, full_page=True)
@patch("m3docvqa.downloader._download_wiki_page")
def test_download_wiki_page_batch(mock_download_wiki_page, tmp_path, test_urls_and_paths):
    """Verify the batch entry point returns per-URL results and writes a JSONL log."""
    urls, save_paths = test_urls_and_paths
    result_jsonl_path = tmp_path / "download_results.jsonl"

    # Each per-page download is mocked to report success with no error.
    mock_download_wiki_page.side_effect = [(True, None), (True, None)]

    results = download_wiki_page(
        urls,
        save_paths,
        'pdf',
        str(result_jsonl_path),
        proc_id=0,
        n_proc=1,
    )

    assert results == [True, True]
    assert result_jsonl_path.exists()

    # The log must hold one successful entry per requested URL.
    with jsonlines.open(result_jsonl_path, 'r') as reader:
        log_entries = list(reader)
    assert len(log_entries) == 2
    for entry in log_entries:
        assert entry['downloaded'] is True
        assert entry['error'] is None

View File

@@ -0,0 +1,86 @@
# Copyright 2024 Bloomberg Finance L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
import pytest
from m3docvqa.pdf_utils import is_pdf_downloaded, is_pdf_clean, get_images_from_pdf
from pathlib import Path
from PIL import Image
from reportlab.pdfgen import canvas # For creating sample PDFs
@pytest.fixture
def sample_pdf(tmp_path) -> Path:
    """Write a small reportlab-generated PDF into ``tmp_path`` and return its path."""
    pdf_path = tmp_path / "sample.pdf"
    pdf_canvas = canvas.Canvas(str(pdf_path))
    # A single line of text is drawn before the canvas is saved to disk.
    pdf_canvas.drawString(100, 100, "Sample PDF text for testing.")
    pdf_canvas.save()
    return pdf_path
@pytest.fixture
def corrupted_pdf(tmp_path) -> Path:
    """Write a file with a PDF header but no valid body and return its path."""
    # Only the version marker plus junk bytes: not a well-formed PDF document.
    broken_payload = b"%PDF-1.4 corrupted content"
    pdf_path = tmp_path / "corrupted.pdf"
    pdf_path.write_bytes(broken_payload)
    return pdf_path
def test_is_pdf_downloaded_existing_pdf(sample_pdf):
    """``is_pdf_downloaded`` must return True for a PDF that exists on disk."""
    result = is_pdf_downloaded(str(sample_pdf))
    assert result is True, "Expected PDF to be recognized as downloaded."
def test_is_pdf_downloaded_nonexistent_pdf(tmp_path):
    """``is_pdf_downloaded`` must return False for a path that was never created."""
    missing_path = str(tmp_path / "non_existent.pdf")
    result = is_pdf_downloaded(missing_path)
    assert result is False, "Expected non-existent PDF to be marked as not downloaded."
def test_is_pdf_clean_valid_pdf(sample_pdf):
    """``is_pdf_clean`` must return True for a well-formed PDF."""
    result = is_pdf_clean(str(sample_pdf))
    assert result is True, "Expected PDF to be recognized as clean."
def test_is_pdf_clean_corrupted_pdf(corrupted_pdf):
    """``is_pdf_clean`` must return False for a PDF with corrupted content."""
    result = is_pdf_clean(str(corrupted_pdf))
    assert result is False, "Expected corrupted PDF to be marked as not clean."
def test_get_images_from_pdf_extract_images(sample_pdf, tmp_path):
    """Verify ``get_images_from_pdf`` returns images and saves them as PNG files."""
    image_dir = tmp_path / "images"
    images = get_images_from_pdf(
        str(sample_pdf),
        save_dir=str(image_dir),
        dpi_resolution=72,
        save_type='png',
    )

    # Something must have been extracted from the sample PDF.
    assert len(images) > 0, "Expected at least one image to be extracted from the PDF."

    # Every returned image must have a matching file in the save directory.
    saved_images = list(image_dir.glob("*.png"))
    assert len(saved_images) == len(images), "Expected number of saved images to match the number of extracted images."

    # Each saved file must open as a real PNG, not merely carry the extension.
    for png_file in saved_images:
        with Image.open(png_file) as opened:
            assert opened.format == "PNG", "Expected saved image to be in PNG format."
def test_get_images_from_pdf_no_save_dir(sample_pdf):
    """Verify ``get_images_from_pdf`` returns PIL images when ``save_dir`` is None."""
    images = get_images_from_pdf(str(sample_pdf), save_dir=None, dpi_resolution=72)

    assert len(images) > 0, "Expected at least one image to be returned without saving."
    are_pil_images = [isinstance(item, Image.Image) for item in images]
    assert all(are_pil_images), "Expected all returned items to be PIL Image objects."

View File

@@ -0,0 +1,107 @@
# Copyright 2024 Bloomberg Finance L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
import pytest
from pathlib import Path
import shutil
import json
import jsonlines
from unittest.mock import MagicMock, patch
from m3docvqa.split_utils import create_split_dirs
@pytest.fixture
def mock_pdf_directory(tmp_path):
    """Create a ``pdfs`` directory under ``tmp_path`` with two placeholder PDF files.

    The files contain plain text rather than real PDF data; only their names
    (``doc1.pdf`` and ``doc2.pdf``) are meaningful to the tests.
    """
    placeholder_files = {
        "doc1.pdf": "PDF content for doc1",
        "doc2.pdf": "PDF content for doc2",
    }
    pdf_dir = tmp_path / "pdfs"
    pdf_dir.mkdir()
    for filename, content in placeholder_files.items():
        (pdf_dir / filename).write_text(content)
    return pdf_dir
@pytest.fixture
def mock_metadata_file(tmp_path):
    """Write a two-record JSONL metadata file under ``tmp_path`` and return its path.

    Each record carries one ``supporting_context`` entry whose ``doc_id`` is
    ``doc1`` or ``doc2``.
    """
    records = [
        {"supporting_context": [{"doc_id": doc_id}]}
        for doc_id in ("doc1", "doc2")
    ]
    metadata_file = tmp_path / "MMQA_train.jsonl"
    with jsonlines.open(metadata_file, mode='w') as writer:
        writer.write_all(records)
    return metadata_file
@pytest.fixture
def mock_target_directory(tmp_path):
    """Return the ``target`` path under ``tmp_path``; the directory is not created here."""
    target_base = tmp_path / "target"
    return target_base
def test_create_split_dirs(mock_pdf_directory, mock_metadata_file, mock_target_directory):
    """Verify ``create_split_dirs`` creates ``pdfs_<split>`` holding the referenced PDFs."""
    split = "train"

    create_split_dirs(
        all_pdf_dir=mock_pdf_directory,
        target_dir_base=mock_target_directory,
        split_metadata_file=mock_metadata_file,
        split=split,
    )

    # The split directory must exist and contain both documents named in the metadata.
    target_dir = mock_target_directory / f"pdfs_{split}"
    assert target_dir.exists(), f"Directory {target_dir} was not created"
    expected_copies = (
        ("doc1.pdf", "doc1.pdf was not copied"),
        ("doc2.pdf", "doc2.pdf was not copied"),
    )
    for filename, failure_message in expected_copies:
        assert (target_dir / filename).exists(), failure_message
def test_create_split_dirs_missing_pdf(mock_metadata_file, mock_target_directory, tmp_path):
    """Verify ``create_split_dirs`` copies nothing when the source PDF directory is absent.

    The split directory is still expected to be created, but neither document
    named in the metadata may appear inside it.
    """
    split = "train"
    # Anchor the missing directory under tmp_path rather than the current
    # working directory, so a stray "non_existing_pdf_dir" next to wherever
    # pytest is launched cannot change the outcome of this test.
    all_pdf_dir = tmp_path / "non_existing_pdf_dir"

    create_split_dirs(
        all_pdf_dir=all_pdf_dir,
        target_dir_base=mock_target_directory,
        split_metadata_file=mock_metadata_file,
        split=split,
    )

    target_dir = mock_target_directory / f"pdfs_{split}"
    assert target_dir.exists(), f"Directory {target_dir} was not created"
    assert not (target_dir / "doc1.pdf").exists(), "doc1.pdf should not exist"
    assert not (target_dir / "doc2.pdf").exists(), "doc2.pdf should not exist"
@pytest.mark.parametrize(
    ("split", "expected_error"),
    [
        ("test_split", ValueError),  # Invalid split type
        (None, ValueError),  # Missing split
    ],
)
def test_create_split_dirs_invalid_split_type(mock_pdf_directory, mock_metadata_file, mock_target_directory, split, expected_error):
    """Verify ``create_split_dirs`` raises the expected error for an unsupported ``split``."""
    call_kwargs = {
        "all_pdf_dir": mock_pdf_directory,
        "target_dir_base": mock_target_directory,
        "split_metadata_file": mock_metadata_file,
        "split": split,
    }
    with pytest.raises(expected_error):
        create_split_dirs(**call_kwargs)