# M3DocRAG/m3docvqa/tests/test_pdf_utils.py

# Copyright 2024 Bloomberg Finance L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0

import pytest
from m3docvqa.pdf_utils import is_pdf_downloaded, is_pdf_clean, get_images_from_pdf
from pathlib import Path
from PIL import Image
from reportlab.pdfgen import canvas  # For creating sample PDFs


@pytest.fixture
def sample_pdf(tmp_path) -> Path:
    """Write a small PDF containing one line of text under ``tmp_path``.

    Returns:
        Path to the generated ``sample.pdf`` file.
    """
    target = tmp_path / "sample.pdf"
    pdf_canvas = canvas.Canvas(str(target))
    # Draw a single text string so the document has some content.
    pdf_canvas.drawString(100, 100, "Sample PDF text for testing.")
    pdf_canvas.save()
    return target


@pytest.fixture
def corrupted_pdf(tmp_path) -> Path:
    """Write a deliberately malformed PDF file under ``tmp_path``.

    Returns:
        Path to the generated ``corrupted.pdf`` file.
    """
    target = tmp_path / "corrupted.pdf"
    # Only a PDF-style header followed by filler text, so the payload is
    # not a complete PDF document.
    with target.open("wb") as handle:
        handle.write(b"%PDF-1.4 corrupted content")
    return target


def test_is_pdf_downloaded_existing_pdf(sample_pdf):
    """is_pdf_downloaded must return True for a PDF that exists on disk."""
    downloaded = is_pdf_downloaded(str(sample_pdf))
    assert downloaded is True, "Expected PDF to be recognized as downloaded."


def test_is_pdf_downloaded_nonexistent_pdf(tmp_path):
    """is_pdf_downloaded must return False for a path that was never created."""
    # The file is intentionally never written to disk.
    missing_path = tmp_path / "non_existent.pdf"
    downloaded = is_pdf_downloaded(str(missing_path))
    assert downloaded is False, "Expected non-existent PDF to be marked as not downloaded."


def test_is_pdf_clean_valid_pdf(sample_pdf):
    """is_pdf_clean must return True for the well-formed sample PDF."""
    clean = is_pdf_clean(str(sample_pdf))
    assert clean is True, "Expected PDF to be recognized as clean."


def test_is_pdf_clean_corrupted_pdf(corrupted_pdf):
    """is_pdf_clean must return False for the malformed PDF fixture."""
    clean = is_pdf_clean(str(corrupted_pdf))
    assert clean is False, "Expected corrupted PDF to be marked as not clean."


def test_get_images_from_pdf_extract_images(sample_pdf, tmp_path):
    """Check that get_images_from_pdf returns images and saves them as PNG files."""
    output_dir = tmp_path / "images"
    extracted = get_images_from_pdf(
        str(sample_pdf),
        save_dir=str(output_dir),
        dpi_resolution=72,
        save_type='png',
    )

    # The call must yield at least one image.
    assert len(extracted) > 0, "Expected at least one image to be extracted from the PDF."

    # There must be exactly one PNG file on disk per returned image.
    png_files = list(output_dir.glob("*.png"))
    assert len(png_files) == len(extracted), "Expected number of saved images to match the number of extracted images."

    # Each saved file must open with Pillow and report the PNG format.
    for png_file in png_files:
        with Image.open(png_file) as opened:
            assert opened.format == "PNG", "Expected saved image to be in PNG format."


def test_get_images_from_pdf_no_save_dir(sample_pdf):
    """Check that get_images_from_pdf returns PIL images when save_dir is None."""
    returned = get_images_from_pdf(str(sample_pdf), save_dir=None, dpi_resolution=72)
    assert len(returned) > 0, "Expected at least one image to be returned without saving."
    # Check every returned item individually; the first non-image fails the test.
    for item in returned:
        assert isinstance(item, Image.Image), "Expected all returned items to be PIL Image objects."