Skip to content

Commit

Permalink
Merge pull request #292 from Modalities/file_existence_policy_for_ind…
Browse files Browse the repository at this point in the history
…exation

File existence policy for indexation
  • Loading branch information
le1nux authored Jan 16, 2025
2 parents 13f1a26 + 551b8f5 commit 2673e1c
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 6 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ repos:
rev: 23.9.1
hooks:
- id: black
language_version: python3.10
language_version: python3.11
stages: [pre-commit]
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.278
Expand Down
31 changes: 26 additions & 5 deletions src/modalities/api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python

import os
from enum import Enum
from pathlib import Path

from pydantic import FilePath
Expand All @@ -15,9 +16,18 @@
from modalities.models.huggingface_adapters.hf_adapter import HFModelAdapter
from modalities.registry.components import COMPONENTS
from modalities.registry.registry import Registry
from modalities.utils.logging import get_logger


def create_raw_data_index(src_path: Path, index_path: Path):
class FileExistencePolicy(Enum):
    """Policy for handling an index file that already exists at the target path.

    Consumed by create_raw_data_index, whose file_existence_policy parameter
    defaults to ERROR.
    """

    # Keep the existing index: log a warning and return without re-indexing.
    SKIP = "skip"
    # Refuse to proceed: raise a ValueError for the already existing index.
    ERROR = "error"
    # Log a warning and delete the existing index before a new one is written.
    OVERRIDE = "override"


def create_raw_data_index(
src_path: Path, index_path: Path, file_existence_policy: FileExistencePolicy = FileExistencePolicy.ERROR
):
"""Creates the index file for the content of a large jsonl-file. The index file
contains the byte-offsets and lengths of each line in the jsonl-file.
Background is the ability to further process the respective file without loading it,
Expand All @@ -32,12 +42,23 @@ def create_raw_data_index(src_path: Path, index_path: Path):
ValueError: If the index file already exists.
"""
index_path = LargeFileLinesReader.default_index_path(src_path, index_path)
os.makedirs(index_path.parent, exist_ok=True)
if index_path.exists():
raise ValueError("index already exists. delete it or specify different output folder.")
if file_existence_policy == FileExistencePolicy.SKIP:
get_logger(name="main").warning(f"Index already exists at {str(index_path)}. Skipping index creation.")
return
elif file_existence_policy == FileExistencePolicy.OVERRIDE:
get_logger(name="main").warning(f"Index already exists at {str(index_path)}. Overriding it.")
os.remove(index_path)
elif file_existence_policy == FileExistencePolicy.ERROR:
raise ValueError("index already exists. delete it or specify different output folder.")
else:
raise ValueError(f"Unknown file existence policy: {file_existence_policy}")

get_logger(name="main").info(
f"Reading raw data from {str(src_path)} and" f" writing index to {str(index_path)} ..."
)
os.makedirs(index_path.parent, exist_ok=True)

print(f"reading raw data from {src_path}")
print(f"writing index to {index_path}")
generator = IndexGenerator(src_path)
generator.create_index(index_path)

Expand Down
11 changes: 11 additions & 0 deletions src/modalities/utils/logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import logging


def get_logger(name: str = "main") -> logging.Logger:
    """Return the logger registered under ``name``, configuring it on first use.

    A logger that already has at least one handler attached is returned
    untouched, so repeated calls never stack additional handlers. Otherwise
    the logger is set to DEBUG level and receives a single StreamHandler
    formatting records as ``<name> - <level> - <message>``.

    Args:
        name: Name of the logger to look up. Defaults to "main".

    Returns:
        The (possibly freshly configured) logger instance.
    """
    named_logger = logging.getLogger(name)

    # Already configured (by an earlier call or by the application): leave as is.
    if named_logger.handlers:
        return named_logger

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter("%(name)s - %(levelname)s - %(message)s"))
    named_logger.addHandler(stream_handler)
    named_logger.setLevel(logging.DEBUG)
    return named_logger

0 comments on commit 2673e1c

Please sign in to comment.