# M3DocRAG/m3docvqa/tests/test_split_utils.py

# Copyright 2024 Bloomberg Finance L.P.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# SPDX-License-Identifier: Apache-2.0

import pytest
from pathlib import Path
import shutil
import json
import jsonlines
from unittest.mock import MagicMock, patch
from m3docvqa.split_utils import create_split_dirs


@pytest.fixture
def mock_pdf_directory(tmp_path):
    """Create ``tmp_path / "pdfs"`` holding two placeholder PDF files.

    The files are plain text written under a ``.pdf`` name; they only need to
    exist so that the code under test has something to find.

    Returns:
        The path of the populated ``pdfs`` directory.
    """
    pdf_root = tmp_path / "pdfs"
    pdf_root.mkdir()

    # File name -> placeholder content written into it.
    placeholder_files = {
        "doc1.pdf": "PDF content for doc1",
        "doc2.pdf": "PDF content for doc2",
    }
    for filename, content in placeholder_files.items():
        (pdf_root / filename).write_text(content)

    return pdf_root


@pytest.fixture
def mock_metadata_file(tmp_path):
    """Write a two-record ``MMQA_train.jsonl`` metadata file under ``tmp_path``.

    Each record has a ``supporting_context`` list with a single entry whose
    ``doc_id`` is ``doc1`` or ``doc2`` respectively.

    Returns:
        The path of the written JSONL file.
    """
    jsonl_path = tmp_path / "MMQA_train.jsonl"
    records = [
        {"supporting_context": [{"doc_id": doc_id}]}
        for doc_id in ("doc1", "doc2")
    ]

    with jsonlines.open(jsonl_path, mode='w') as sink:
        for record in records:
            sink.write(record)

    return jsonl_path


@pytest.fixture
def mock_target_directory(tmp_path):
    """Return the path ``tmp_path / "target"``; the directory is not created here."""
    target_base = tmp_path / "target"
    return target_base


def test_create_split_dirs(mock_pdf_directory, mock_metadata_file, mock_target_directory):
    """Check that create_split_dirs populates ``pdfs_<split>`` with the listed PDFs."""
    split = "train"

    # Run the function under test against the fixture-provided inputs.
    call_kwargs = {
        "all_pdf_dir": mock_pdf_directory,
        "target_dir_base": mock_target_directory,
        "split_metadata_file": mock_metadata_file,
        "split": split,
    }
    create_split_dirs(**call_kwargs)

    # The split directory must exist ...
    target_dir = mock_target_directory / f"pdfs_{split}"
    assert target_dir.exists(), f"Directory {target_dir} was not created"

    # ... and hold every PDF referenced by the metadata file.
    expected_files = {
        "doc1.pdf": "doc1.pdf was not copied",
        "doc2.pdf": "doc2.pdf was not copied",
    }
    for filename, failure_message in expected_files.items():
        assert (target_dir / filename).exists(), failure_message


def test_create_split_dirs_missing_pdf(mock_metadata_file, mock_target_directory):
    """Test create_split_dirs when the source PDF directory does not exist.

    The split directory is still expected to be created, but none of the PDFs
    named in the metadata file may appear inside it.
    """
    split = "train"

    # Anchor the "missing" directory inside the per-test temporary tree rather
    # than relative to the current working directory, so the outcome cannot be
    # influenced by whatever happens to exist where pytest was launched.
    all_pdf_dir = mock_target_directory.parent / "non_existing_pdf_dir"
    assert not all_pdf_dir.exists(), f"Precondition failed: {all_pdf_dir} exists"

    # Call the function and verify that the missing PDFs are handled correctly
    create_split_dirs(
        all_pdf_dir=all_pdf_dir,
        target_dir_base=mock_target_directory,
        split_metadata_file=mock_metadata_file,
        split=split,
    )

    target_dir = mock_target_directory / f"pdfs_{split}"
    assert target_dir.exists(), f"Directory {target_dir} was not created"
    assert not (target_dir / "doc1.pdf").exists(), "doc1.pdf should not exist"
    assert not (target_dir / "doc2.pdf").exists(), "doc2.pdf should not exist"


@pytest.mark.parametrize(
    "split, expected_error",
    [
        # A split name that the test expects to be rejected.
        ("test_split", ValueError),
        # No split supplied at all.
        (None, ValueError),
    ],
)
def test_create_split_dirs_invalid_split_type(mock_pdf_directory, mock_metadata_file, mock_target_directory, split, expected_error):
    """Check that create_split_dirs raises the expected error for a bad ``split``."""
    # Assemble the arguments up front so that only the call under test sits
    # inside the ``pytest.raises`` block.
    call_kwargs = {
        "all_pdf_dir": mock_pdf_directory,
        "target_dir_base": mock_target_directory,
        "split_metadata_file": mock_metadata_file,
        "split": split,
    }
    with pytest.raises(expected_error):
        create_split_dirs(**call_kwargs)