Release commit

Signed-off-by: Stephen Augustus <saugustus2@bloomberg.net>
2025-01-30 17:04:56 -05:00
parent e04aeadfb0
commit 27aac8d521
50 changed files with 5692 additions and 0 deletions
--- a/m3docvqa/tests/test_downloader.py
+++ b/m3docvqa/tests/test_downloader.py
@@ -0,0 +1,105 @@
+# Copyright 2024 Bloomberg Finance L.P.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+from unittest.mock import patch, MagicMock
+from pathlib import Path
+import jsonlines
+from m3docvqa.downloader import _download_wiki_page, download_wiki_page
+
+
+@pytest.fixture
+def test_urls_and_paths(tmp_path):
+    """Fixture to provide sample URLs and save paths for testing."""
+    urls = ["https://en.wikipedia.org/wiki/SamplePage1", "https://en.wikipedia.org/wiki/SamplePage2"]
+    save_paths = [str(tmp_path / "sample1.pdf"), str(tmp_path / "sample2.pdf")]
+    return urls, save_paths
+
+
+@patch("m3docvqa.downloader.sync_playwright")
+def test__download_wiki_page_pdf(mock_playwright, tmp_path):
+    """Test downloading a single page as a PDF."""
+    url = "https://en.wikipedia.org/wiki/SamplePage"
+    save_path = tmp_path / "sample.pdf"
+    args = (0, 1, url, str(save_path), 'pdf', 0)
+
+    # Mock Playwright behavior
+    mock_browser = MagicMock()
+    mock_context = MagicMock()
+    mock_page = MagicMock()
+    mock_playwright.return_value.__enter__.return_value.chromium.launch.return_value = mock_browser
+    mock_browser.new_context.return_value = mock_context
+    mock_context.new_page.return_value = mock_page
+
+    # Call the function
+    downloaded, error = _download_wiki_page(args)
+
+    # Assertions
+    assert downloaded is True
+    assert error is None
+    mock_page.goto.assert_called_once_with(url)
+    mock_page.pdf.assert_called_once_with(path=str(save_path))
+
+
+@patch("m3docvqa.downloader.sync_playwright")
+def test__download_wiki_page_png(mock_playwright, tmp_path):
+    """Test downloading a single page as a PNG."""
+    url = "https://en.wikipedia.org/wiki/SamplePage"
+    save_path = tmp_path / "sample.png"
+    args = (0, 1, url, str(save_path), 'png', 0)
+
+    # Mock Playwright behavior
+    mock_browser = MagicMock()
+    mock_context = MagicMock()
+    mock_page = MagicMock()
+    mock_playwright.return_value.__enter__.return_value.chromium.launch.return_value = mock_browser
+    mock_browser.new_context.return_value = mock_context
+    mock_context.new_page.return_value = mock_page
+
+    # Call the function
+    downloaded, error = _download_wiki_page(args)
+
+    # Assertions
+    assert downloaded is True
+    assert error is None
+    mock_page.goto.assert_called_once_with(url)
+    mock_page.screenshot.assert_called_once_with(path=str(save_path), full_page=True)
+
+
+@patch("m3docvqa.downloader._download_wiki_page")
+def test_download_wiki_page_batch(mock_download_wiki_page, tmp_path, test_urls_and_paths):
+    """Test batch downloading multiple Wikipedia pages."""
+    urls, save_paths = test_urls_and_paths
+    result_jsonl_path = tmp_path / "download_results.jsonl"
+
+    # Mock individual downloads to always succeed
+    mock_download_wiki_page.side_effect = [(True, None), (True, None)]
+
+    # Call the function
+    results = download_wiki_page(urls, save_paths, 'pdf', str(result_jsonl_path), proc_id=0, n_proc=1)
+
+    # Assertions
+    assert results == [True, True]
+    assert result_jsonl_path.exists()
+
+    # Check JSONL log entries
+    with jsonlines.open(result_jsonl_path, 'r') as reader:
+        log_entries = list(reader)
+        assert len(log_entries) == 2
+        assert log_entries[0]['downloaded'] is True
+        assert log_entries[0]['error'] is None
+        assert log_entries[1]['downloaded'] is True
+        assert log_entries[1]['error'] is None
--- a/m3docvqa/tests/test_pdf_utils.py
+++ b/m3docvqa/tests/test_pdf_utils.py
@@ -0,0 +1,86 @@
+# Copyright 2024 Bloomberg Finance L.P.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+from m3docvqa.pdf_utils import is_pdf_downloaded, is_pdf_clean, get_images_from_pdf
+from pathlib import Path
+from PIL import Image
+from reportlab.pdfgen import canvas  # For creating sample PDFs
+
+
+@pytest.fixture
+def sample_pdf(tmp_path) -> Path:
+    """Create a temporary sample PDF file for testing."""
+    pdf_path = tmp_path / "sample.pdf"
+    c = canvas.Canvas(str(pdf_path))
+    c.drawString(100, 100, "Sample PDF text for testing.")  # Add sample text to the PDF
+    c.save()
+    return pdf_path
+
+
+@pytest.fixture
+def corrupted_pdf(tmp_path) -> Path:
+    """Create a temporary, corrupted PDF file for testing."""
+    pdf_path = tmp_path / "corrupted.pdf"
+    pdf_path.write_bytes(b"%PDF-1.4 corrupted content")  # Write incomplete/corrupted PDF content
+    return pdf_path
+
+
+def test_is_pdf_downloaded_existing_pdf(sample_pdf):
+    """Test is_pdf_downloaded on a valid, existing PDF."""
+    assert is_pdf_downloaded(str(sample_pdf)) is True, "Expected PDF to be recognized as downloaded."
+
+
+def test_is_pdf_downloaded_nonexistent_pdf(tmp_path):
+    """Test is_pdf_downloaded on a non-existent PDF file."""
+    non_existent_pdf = tmp_path / "non_existent.pdf"
+    assert is_pdf_downloaded(str(non_existent_pdf)) is False, "Expected non-existent PDF to be marked as not downloaded."
+
+
+def test_is_pdf_clean_valid_pdf(sample_pdf):
+    """Test is_pdf_clean on a valid, clean PDF."""
+    assert is_pdf_clean(str(sample_pdf)) is True, "Expected PDF to be recognized as clean."
+
+
+def test_is_pdf_clean_corrupted_pdf(corrupted_pdf):
+    """Test is_pdf_clean on a corrupted PDF."""
+    assert is_pdf_clean(str(corrupted_pdf)) is False, "Expected corrupted PDF to be marked as not clean."
+
+
+def test_get_images_from_pdf_extract_images(sample_pdf, tmp_path):
+    """Test get_images_from_pdf to ensure it extracts images correctly."""
+    image_dir = tmp_path / "images"
+    images = get_images_from_pdf(str(sample_pdf), save_dir=str(image_dir), dpi_resolution=72, save_type='png')
+
+    # Verify that at least one image was extracted
+    assert len(images) > 0, "Expected at least one image to be extracted from the PDF."
+
+    # Verify that images were saved to the directory
+    saved_images = list(image_dir.glob("*.png"))
+    assert len(saved_images) == len(images), "Expected number of saved images to match the number of extracted images."
+
+    # Verify that the saved image files exist and are valid
+    for image_path in saved_images:
+        with Image.open(image_path) as img:
+            assert img.format == "PNG", "Expected saved image to be in PNG format."
+
+
+def test_get_images_from_pdf_no_save_dir(sample_pdf):
+    """Test get_images_from_pdf without saving images, only returning them as a list."""
+    images = get_images_from_pdf(str(sample_pdf), save_dir=None, dpi_resolution=72)
+    assert len(images) > 0, "Expected at least one image to be returned without saving."
+    assert all(isinstance(image, Image.Image) for image in images), "Expected all returned items to be PIL Image objects."
+
--- a/m3docvqa/tests/test_split_utils.py
+++ b/m3docvqa/tests/test_split_utils.py
@@ -0,0 +1,107 @@
+# Copyright 2024 Bloomberg Finance L.P.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+from pathlib import Path
+import shutil
+import json
+import jsonlines
+from unittest.mock import MagicMock, patch
+from m3docvqa.split_utils import create_split_dirs
+
+
+@pytest.fixture
+def mock_pdf_directory(tmp_path):
+    # Create a temporary directory for PDFs
+    pdf_dir = tmp_path / "pdfs"
+    pdf_dir.mkdir()
+    # Add some mock PDF files
+    (pdf_dir / "doc1.pdf").write_text("PDF content for doc1")
+    (pdf_dir / "doc2.pdf").write_text("PDF content for doc2")
+    return pdf_dir
+
+
+@pytest.fixture
+def mock_metadata_file(tmp_path):
+    # Create a temporary metadata file in JSONL format
+    metadata_file = tmp_path / "MMQA_train.jsonl"
+    data = [
+        {"supporting_context": [{"doc_id": "doc1"}]},
+        {"supporting_context": [{"doc_id": "doc2"}]}
+    ]
+    with jsonlines.open(metadata_file, mode='w') as writer:
+        writer.write_all(data)
+    return metadata_file
+
+
+@pytest.fixture
+def mock_target_directory(tmp_path):
+    return tmp_path / "target"
+
+
+def test_create_split_dirs(mock_pdf_directory, mock_metadata_file, mock_target_directory):
+    """Test the create_split_dirs function."""
+    # Prepare the split directory
+    split = "train"
+    
+    # Call the function to create split directories
+    create_split_dirs(
+        all_pdf_dir=mock_pdf_directory,
+        target_dir_base=mock_target_directory,
+        split_metadata_file=mock_metadata_file,
+        split=split
+    )
+    
+    # Assert that the target directory exists and contains the expected PDF files
+    target_dir = mock_target_directory / f"pdfs_{split}"
+    assert target_dir.exists(), f"Directory {target_dir} was not created"
+    assert (target_dir / "doc1.pdf").exists(), "doc1.pdf was not copied"
+    assert (target_dir / "doc2.pdf").exists(), "doc2.pdf was not copied"
+
+
+def test_create_split_dirs_missing_pdf(mock_metadata_file, mock_target_directory):
+    """Test create_split_dirs when PDF files are missing."""
+    # Prepare the split directory
+    split = "train"
+    all_pdf_dir = Path("non_existing_pdf_dir")
+    
+    # Call the function and verify that the missing PDFs are handled correctly
+    create_split_dirs(
+        all_pdf_dir=all_pdf_dir,
+        target_dir_base=mock_target_directory,
+        split_metadata_file=mock_metadata_file,
+        split=split
+    )
+    
+    target_dir = mock_target_directory / f"pdfs_{split}"
+    assert target_dir.exists(), f"Directory {target_dir} was not created"
+    assert not (target_dir / "doc1.pdf").exists(), "doc1.pdf should not exist"
+    assert not (target_dir / "doc2.pdf").exists(), "doc2.pdf should not exist"
+
+
+@pytest.mark.parametrize("split, expected_error", [
+    ("test_split", ValueError),  # Invalid split type
+    (None, ValueError),  # Missing split
+])
+def test_create_split_dirs_invalid_split_type(mock_pdf_directory, mock_metadata_file, mock_target_directory, split, expected_error):
+    """Test invalid split types in create_split_dirs."""
+    with pytest.raises(expected_error):
+        create_split_dirs(
+            all_pdf_dir=mock_pdf_directory,
+            target_dir_base=mock_target_directory,
+            split_metadata_file=mock_metadata_file,
+            split=split
+        )