Files
M3DocRAG/m3docvqa/tests/test_pdf_utils.py
j-min 27aac8d521 Release commit
Signed-off-by: Stephen Augustus <saugustus2@bloomberg.net>
2025-02-15 09:52:51 -05:00

87 lines
3.6 KiB
Python

# Copyright 2024 Bloomberg Finance L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0
import pytest
from m3docvqa.pdf_utils import is_pdf_downloaded, is_pdf_clean, get_images_from_pdf
from pathlib import Path
from PIL import Image
from reportlab.pdfgen import canvas # For creating sample PDFs
@pytest.fixture
def sample_pdf(tmp_path) -> Path:
"""Create a temporary sample PDF file for testing."""
pdf_path = tmp_path / "sample.pdf"
c = canvas.Canvas(str(pdf_path))
c.drawString(100, 100, "Sample PDF text for testing.") # Add sample text to the PDF
c.save()
return pdf_path
@pytest.fixture
def corrupted_pdf(tmp_path) -> Path:
"""Create a temporary, corrupted PDF file for testing."""
pdf_path = tmp_path / "corrupted.pdf"
pdf_path.write_bytes(b"%PDF-1.4 corrupted content") # Write incomplete/corrupted PDF content
return pdf_path
def test_is_pdf_downloaded_existing_pdf(sample_pdf):
"""Test is_pdf_downloaded on a valid, existing PDF."""
assert is_pdf_downloaded(str(sample_pdf)) is True, "Expected PDF to be recognized as downloaded."
def test_is_pdf_downloaded_nonexistent_pdf(tmp_path):
"""Test is_pdf_downloaded on a non-existent PDF file."""
non_existent_pdf = tmp_path / "non_existent.pdf"
assert is_pdf_downloaded(str(non_existent_pdf)) is False, "Expected non-existent PDF to be marked as not downloaded."
def test_is_pdf_clean_valid_pdf(sample_pdf):
"""Test is_pdf_clean on a valid, clean PDF."""
assert is_pdf_clean(str(sample_pdf)) is True, "Expected PDF to be recognized as clean."
def test_is_pdf_clean_corrupted_pdf(corrupted_pdf):
"""Test is_pdf_clean on a corrupted PDF."""
assert is_pdf_clean(str(corrupted_pdf)) is False, "Expected corrupted PDF to be marked as not clean."
def test_get_images_from_pdf_extract_images(sample_pdf, tmp_path):
"""Test get_images_from_pdf to ensure it extracts images correctly."""
image_dir = tmp_path / "images"
images = get_images_from_pdf(str(sample_pdf), save_dir=str(image_dir), dpi_resolution=72, save_type='png')
# Verify that at least one image was extracted
assert len(images) > 0, "Expected at least one image to be extracted from the PDF."
# Verify that images were saved to the directory
saved_images = list(image_dir.glob("*.png"))
assert len(saved_images) == len(images), "Expected number of saved images to match the number of extracted images."
# Verify that the saved image files exist and are valid
for image_path in saved_images:
with Image.open(image_path) as img:
assert img.format == "PNG", "Expected saved image to be in PNG format."
def test_get_images_from_pdf_no_save_dir(sample_pdf):
"""Test get_images_from_pdf without saving images, only returning them as a list."""
images = get_images_from_pdf(str(sample_pdf), save_dir=None, dpi_resolution=72)
assert len(images) > 0, "Expected at least one image to be returned without saving."
assert all(isinstance(image, Image.Image) for image in images), "Expected all returned items to be PIL Image objects."