Release commit
Signed-off-by: Stephen Augustus <saugustus2@bloomberg.net>
This commit is contained in:
86
m3docvqa/tests/test_pdf_utils.py
Normal file
86
m3docvqa/tests/test_pdf_utils.py
Normal file
@ -0,0 +1,86 @@
|
||||
# Copyright 2024 Bloomberg Finance L.P.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
|
||||
import pytest
|
||||
from m3docvqa.pdf_utils import is_pdf_downloaded, is_pdf_clean, get_images_from_pdf
|
||||
from pathlib import Path
|
||||
from PIL import Image
|
||||
from reportlab.pdfgen import canvas # For creating sample PDFs
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sample_pdf(tmp_path) -> Path:
|
||||
"""Create a temporary sample PDF file for testing."""
|
||||
pdf_path = tmp_path / "sample.pdf"
|
||||
c = canvas.Canvas(str(pdf_path))
|
||||
c.drawString(100, 100, "Sample PDF text for testing.") # Add sample text to the PDF
|
||||
c.save()
|
||||
return pdf_path
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def corrupted_pdf(tmp_path) -> Path:
|
||||
"""Create a temporary, corrupted PDF file for testing."""
|
||||
pdf_path = tmp_path / "corrupted.pdf"
|
||||
pdf_path.write_bytes(b"%PDF-1.4 corrupted content") # Write incomplete/corrupted PDF content
|
||||
return pdf_path
|
||||
|
||||
|
||||
def test_is_pdf_downloaded_existing_pdf(sample_pdf):
|
||||
"""Test is_pdf_downloaded on a valid, existing PDF."""
|
||||
assert is_pdf_downloaded(str(sample_pdf)) is True, "Expected PDF to be recognized as downloaded."
|
||||
|
||||
|
||||
def test_is_pdf_downloaded_nonexistent_pdf(tmp_path):
|
||||
"""Test is_pdf_downloaded on a non-existent PDF file."""
|
||||
non_existent_pdf = tmp_path / "non_existent.pdf"
|
||||
assert is_pdf_downloaded(str(non_existent_pdf)) is False, "Expected non-existent PDF to be marked as not downloaded."
|
||||
|
||||
|
||||
def test_is_pdf_clean_valid_pdf(sample_pdf):
|
||||
"""Test is_pdf_clean on a valid, clean PDF."""
|
||||
assert is_pdf_clean(str(sample_pdf)) is True, "Expected PDF to be recognized as clean."
|
||||
|
||||
|
||||
def test_is_pdf_clean_corrupted_pdf(corrupted_pdf):
|
||||
"""Test is_pdf_clean on a corrupted PDF."""
|
||||
assert is_pdf_clean(str(corrupted_pdf)) is False, "Expected corrupted PDF to be marked as not clean."
|
||||
|
||||
|
||||
def test_get_images_from_pdf_extract_images(sample_pdf, tmp_path):
|
||||
"""Test get_images_from_pdf to ensure it extracts images correctly."""
|
||||
image_dir = tmp_path / "images"
|
||||
images = get_images_from_pdf(str(sample_pdf), save_dir=str(image_dir), dpi_resolution=72, save_type='png')
|
||||
|
||||
# Verify that at least one image was extracted
|
||||
assert len(images) > 0, "Expected at least one image to be extracted from the PDF."
|
||||
|
||||
# Verify that images were saved to the directory
|
||||
saved_images = list(image_dir.glob("*.png"))
|
||||
assert len(saved_images) == len(images), "Expected number of saved images to match the number of extracted images."
|
||||
|
||||
# Verify that the saved image files exist and are valid
|
||||
for image_path in saved_images:
|
||||
with Image.open(image_path) as img:
|
||||
assert img.format == "PNG", "Expected saved image to be in PNG format."
|
||||
|
||||
|
||||
def test_get_images_from_pdf_no_save_dir(sample_pdf):
|
||||
"""Test get_images_from_pdf without saving images, only returning them as a list."""
|
||||
images = get_images_from_pdf(str(sample_pdf), save_dir=None, dpi_resolution=72)
|
||||
assert len(images) > 0, "Expected at least one image to be returned without saving."
|
||||
assert all(isinstance(image, Image.Image) for image in images), "Expected all returned items to be PIL Image objects."
|
||||
|
Reference in New Issue
Block a user