From dac52d3da151e0c894dd39e42ef816f9737b3d62 Mon Sep 17 00:00:00 2001 From: Ajeya Bhat Date: Thu, 11 Apr 2024 17:19:21 +0530 Subject: [PATCH 1/3] init --- outpostkit/repository/_loaders/__init__.py | 0 .../_loaders/transformers/__init__.py | 149 +++++++ .../_loaders/transformers/constants.py | 14 + .../_loaders/transformers/download.py | 83 ++++ .../repository/_loaders/transformers/peft.py | 42 ++ .../repository/_loaders/transformers/raw.py | 387 ++++++++++++++++++ .../repository/_loaders/transformers/utils.py | 0 outpostkit/repository/download.py | 5 + 8 files changed, 680 insertions(+) create mode 100644 outpostkit/repository/_loaders/__init__.py create mode 100644 outpostkit/repository/_loaders/transformers/__init__.py create mode 100644 outpostkit/repository/_loaders/transformers/constants.py create mode 100644 outpostkit/repository/_loaders/transformers/download.py create mode 100644 outpostkit/repository/_loaders/transformers/peft.py create mode 100644 outpostkit/repository/_loaders/transformers/raw.py create mode 100644 outpostkit/repository/_loaders/transformers/utils.py create mode 100644 outpostkit/repository/download.py diff --git a/outpostkit/repository/_loaders/__init__.py b/outpostkit/repository/_loaders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/outpostkit/repository/_loaders/transformers/__init__.py b/outpostkit/repository/_loaders/transformers/__init__.py new file mode 100644 index 0000000..19d5eb4 --- /dev/null +++ b/outpostkit/repository/_loaders/transformers/__init__.py @@ -0,0 +1,149 @@ +import copy +import json +import os + +from outpostkit._utils.import_utils import is_peft_available, is_transformers_available +from outpostkit.logger import init_outpost_logger +from outpostkit.repository._loaders.transformers.peft import find_adapter_config_file + +logger = init_outpost_logger(__name__) + +if is_transformers_available: + from transformers import AutoConfig, PretrainedConfig + + +# MODEL_CARD_NAME = "modelcard.json" + + +# ref: https://github.com/huggingface/transformers/blob/a5e5c92aea1e99cb84d7342bd63826ca6cd884c4/src/transformers/models/auto/auto_factory.py#L445 +def setup_model_for_transformers( + full_name_or_dir: str, store_dir: str, *model_args, **kwargs +): + config = kwargs.pop("config", None) + trust_remote_code = kwargs.pop("trust_remote_code", None) + kwargs["_from_auto"] = True + + hub_kwargs_names = [ + # "cache_dir", + # "force_download", + # "local_files_only", + # "proxies", + # "resume_download", + "revision", + "subfolder", + # "use_auth_token", + "token", + ] + + hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs} + code_revision = kwargs.pop("code_revision", None) + adapter_kwargs = kwargs.pop("adapter_kwargs", None) + token = hub_kwargs.pop("token", None) + revision = str(kwargs.get("revision")) + if token is not None: + hub_kwargs["token"] = token + # if resolved is None: + # if not isinstance(config, PretrainedConfig): + # # We make a call to the config file first (which may be absent) to get the commit hash as soon as possible + # resolved_config_file = get_file( + # full_name_or_dir=full_name_or_dir, + # repo_type="model", + # file_path=CONFIG_NAME, + # **hub_kwargs, + # ) + # else: + # commit_hash = getattr(config, "_commit_hash", None) + + if is_peft_available(): + if adapter_kwargs is None: + adapter_kwargs = {} + if token is not None: + adapter_kwargs["token"] = token + + maybe_adapter_path = find_adapter_config_file( + full_name_or_dir, + ref=revision, + **adapter_kwargs, + ) + 
+ if maybe_adapter_path is not None: + with open(maybe_adapter_path, encoding="utf-8") as f: + adapter_config = json.load(f) + + adapter_kwargs["_adapter_model_path"] = full_name_or_dir + pretrained_model_name_or_path = adapter_config[ + "base_model_name_or_path" + ] + + if not isinstance(config, PretrainedConfig): + kwargs_orig = copy.deepcopy(kwargs) + # ensure not to pollute the config object with torch_dtype="auto" - since it's + # meaningless in the context of the config object - torch.dtype values are acceptable + if kwargs.get("torch_dtype", None) == "auto": + _ = kwargs.pop("torch_dtype") + # to not overwrite the quantization_config if config has a quantization_config + if kwargs.get("quantization_config", None) is not None: + _ = kwargs.pop("quantization_config") + + config, kwargs = AutoConfig.from_pretrained( + pretrained_model_name_or_path, + return_unused_kwargs=True, + trust_remote_code=trust_remote_code, + code_revision=code_revision, + _commit_hash=commit_hash, + **hub_kwargs, + **kwargs, + ) + + # if torch_dtype=auto was passed here, ensure to pass it on + if kwargs_orig.get("torch_dtype", None) == "auto": + kwargs["torch_dtype"] = "auto" + if kwargs_orig.get("quantization_config", None) is not None: + kwargs["quantization_config"] = kwargs_orig["quantization_config"] + + has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map + has_local_code = type(config) in cls._model_mapping.keys() + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, + pretrained_model_name_or_path, + has_local_code, + has_remote_code, + ) + + # Set the adapter kwargs + kwargs["adapter_kwargs"] = adapter_kwargs + + if has_remote_code and trust_remote_code: + class_ref = config.auto_map[cls.__name__] + model_class = get_class_from_dynamic_module( + class_ref, + pretrained_model_name_or_path, + code_revision=code_revision, + **hub_kwargs, + **kwargs, + ) + _ = hub_kwargs.pop("code_revision", None) + if os.path.isdir(pretrained_model_name_or_path): + model_class.register_for_auto_class(cls.__name__) + else: + cls.register(config.__class__, model_class, exist_ok=True) + return model_class.from_pretrained( + pretrained_model_name_or_path, + *model_args, + config=config, + **hub_kwargs, + **kwargs, + ) + elif type(config) in cls._model_mapping.keys(): + model_class = _get_model_class(config, cls._model_mapping) + return model_class.from_pretrained( + pretrained_model_name_or_path, + *model_args, + config=config, + **hub_kwargs, + **kwargs, + ) + raise ValueError( + f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" + f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." 
+    )
diff --git a/outpostkit/repository/_loaders/transformers/constants.py b/outpostkit/repository/_loaders/transformers/constants.py
new file mode 100644
index 0000000..4707c39
--- /dev/null
+++ b/outpostkit/repository/_loaders/transformers/constants.py
@@ -0,0 +1,14 @@
+PT_WEIGHTS_NAME = "pytorch_model.bin"
+PT_WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json"
+TF2_WEIGHTS_NAME = "tf_model.h5"
+TF2_WEIGHTS_INDEX_NAME = "tf_model.h5.index.json"
+TF_WEIGHTS_NAME = "model.ckpt"
+FLAX_WEIGHTS_NAME = "flax_model.msgpack"
+FLAX_WEIGHTS_INDEX_NAME = "flax_model.msgpack.index.json"
+SAFE_WEIGHTS_NAME = "model.safetensors"
+SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"
+CONFIG_NAME = "config.json"
+FEATURE_EXTRACTOR_NAME = "preprocessor_config.json"
+IMAGE_PROCESSOR_NAME = FEATURE_EXTRACTOR_NAME
+PROCESSOR_NAME = "processor_config.json"
+GENERATION_CONFIG_NAME = "generation_config.json"
diff --git a/outpostkit/repository/_loaders/transformers/download.py b/outpostkit/repository/_loaders/transformers/download.py
new file mode 100644
index 0000000..72697cd
--- /dev/null
+++ b/outpostkit/repository/_loaders/transformers/download.py
@@ -0,0 +1,83 @@
+import os
+from typing import Optional
+
+from outpostkit._types.repository import REPOSITORY_TYPES
+from outpostkit._utils import save_file_at_path_from_response, split_full_name
+from outpostkit.client import Client
+from outpostkit.repository import RepositoryAtRef
+
+
+def load_local_file_if_present(file_path: str):
+    if os.path.isfile(file_path):
+        with open(file_path) as file:
+            # Read and return the file's contents.
+            file_contents = file.read()
+            return file_contents
+    else:
+        raise FileNotFoundError(f"The file '{file_path}' does not exist.")
+
+
+def is_file_present_locally(file_path: str):
+    # Raises FileNotFoundError instead of returning False so callers fail fast.
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"The file '{file_path}' does not exist.")
+
+
+def download_file_from_repo(
+    repo_type: REPOSITORY_TYPES,
+    full_name: str,
+    file_path: str,
+    store_dir: str,
+    client: Optional[Client],
+    ref: str = "HEAD",
+):
+    try:
+        (repo_entity, repo_name) = split_full_name(full_name)
+    except ValueError:
+        raise FileNotFoundError(
+            f"Invalid {repo_type} repository fullName or path {full_name}"
+        ) from None
+
+    if client is None:
+        client = Client()
+    repo = RepositoryAtRef(
+        entity=repo_entity,
+        name=repo_name,
+        ref=ref,
+        repo_type=repo_type,
+        client=client,
+    )
+    get_file_resp = repo.download_blob(file_path, raw=True)
+    file_loc = os.path.join(store_dir, file_path)
+    save_file_at_path_from_response(get_file_resp, file_loc)
+    return file_loc
+
+
+def get_file(
+    full_name_or_dir: str,
+    repo_type: REPOSITORY_TYPES,
+    file_path: str,
+    store_dir: str,
+    ref: str = "HEAD",
+    token: Optional[str] = None,
+    client: Optional[Client] = None,
+    **kwargs,
+) -> str:
+    subfolder = kwargs.pop("subfolder", None)
+    if subfolder is not None:
+        file_path = os.path.join(subfolder, file_path)
+    if token and not client:
+        client = Client(api_token=token)
+    if os.path.isdir(full_name_or_dir):
+        file_loc = os.path.join(full_name_or_dir, file_path)
+        is_file_present_locally(file_loc)
+        return file_loc
+    else:
+        return download_file_from_repo(
+            repo_type=repo_type,
+            store_dir=store_dir,
+            ref=ref,
+            client=client,
+            file_path=file_path,
+            full_name=full_name_or_dir,
+        )
diff --git a/outpostkit/repository/_loaders/transformers/peft.py b/outpostkit/repository/_loaders/transformers/peft.py
new file mode 100644
index 0000000..9c89512
--- /dev/null
+++ b/outpostkit/repository/_loaders/transformers/peft.py
@@ -0,0 +1,42 @@
+from typing import Optional
+
+from outpostkit.client import Client
+from outpostkit.exceptions import OutpostHTTPException
+from outpostkit.logger import init_outpost_logger
+from outpostkit.repository._loaders.transformers.download import get_file
+
+ADAPTER_CONFIG_NAME = "adapter_config.json"
+ADAPTER_WEIGHTS_NAME = "adapter_model.bin"
+ADAPTER_SAFE_WEIGHTS_NAME = "adapter_model.safetensors"
+
+logger = init_outpost_logger(__name__)
+
+
+def find_adapter_config_file(
+    full_name_or_dir: str,
+    store_dir: str,
+    ref: str = "HEAD",
+    token: Optional[str] = None,
+    client: Optional[Client] = None,
+    **kwargs,
+) -> Optional[str]:
+    adapter_cached_filename = None
+    try:
+        adapter_cached_filename = get_file(
+            full_name_or_dir=full_name_or_dir,
+            file_path=ADAPTER_CONFIG_NAME,
+            repo_type="model",
+            store_dir=store_dir,
+            ref=ref,
+            token=token,
+            client=client,
+            **kwargs,
+        )
+    except FileNotFoundError:
+        pass
+    except OutpostHTTPException as e:
+        if e.code == 404:
+            logger.warning("Could not find PEFT config file. Continuing...")
+        else:
+            raise e
+    return adapter_cached_filename
diff --git a/outpostkit/repository/_loaders/transformers/raw.py b/outpostkit/repository/_loaders/transformers/raw.py
new file mode 100644
index 0000000..e8a06fd
--- /dev/null
+++ b/outpostkit/repository/_loaders/transformers/raw.py
@@ -0,0 +1,387 @@
+def setup_model_for_transformers(
+    full_name_or_dir: str, store_dir: str, *model_args, **kwargs
+):
+    if model_kwargs is None:
+        model_kwargs = {}
+    # Make sure we only pass use_auth_token once as a kwarg (it used to be possible to pass it in model_kwargs,
+    # this is to keep BC).
+    use_auth_token = model_kwargs.pop("use_auth_token", None)
+    if use_auth_token is not None:
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+            FutureWarning,
+        )
+        if token is not None:
+            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
+        token = use_auth_token
+
+    code_revision = kwargs.pop("code_revision", None)
+    commit_hash = kwargs.pop("_commit_hash", None)
+
+    hub_kwargs = {
+        "revision": revision,
+        "token": token,
+        "trust_remote_code": trust_remote_code,
+        "_commit_hash": commit_hash,
+    }
+
+    if task is None and model is None:
+        raise RuntimeError(
+            "Impossible to instantiate a pipeline without either a task or a model "
+            "being specified. "
+            "Please provide a task class or a model"
+        )
+
+    if model is None and tokenizer is not None:
+        raise RuntimeError(
+            "Impossible to instantiate a pipeline with tokenizer specified but not the model as the provided tokenizer"
+            " may not be compatible with the default model. Please provide a PreTrainedModel class or a"
+            " path/identifier to a pretrained model when providing tokenizer."
+        )
+    if model is None and feature_extractor is not None:
+        raise RuntimeError(
+            "Impossible to instantiate a pipeline with feature_extractor specified but not the model as the provided"
+            " feature_extractor may not be compatible with the default model. Please provide a PreTrainedModel class"
+            " or a path/identifier to a pretrained model when providing feature_extractor."
+ ) + if isinstance(model, Path): + model = str(model) + + if commit_hash is None: + pretrained_model_name_or_path = None + if isinstance(config, str): + pretrained_model_name_or_path = config + elif config is None and isinstance(model, str): + pretrained_model_name_or_path = model + + if not isinstance(config, PretrainedConfig) and pretrained_model_name_or_path is not None: + # We make a call to the config file first (which may be absent) to get the commit hash as soon as possible + resolved_config_file = cached_file( + pretrained_model_name_or_path, + CONFIG_NAME, + _raise_exceptions_for_gated_repo=False, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + **hub_kwargs, + ) + hub_kwargs["_commit_hash"] = extract_commit_hash(resolved_config_file, commit_hash) + else: + hub_kwargs["_commit_hash"] = getattr(config, "_commit_hash", None) + + # Config is the primordial information item. + # Instantiate config if needed + if isinstance(config, str): + config = AutoConfig.from_pretrained( + config, _from_pipeline=task, code_revision=code_revision, **hub_kwargs, **model_kwargs + ) + hub_kwargs["_commit_hash"] = config._commit_hash + elif config is None and isinstance(model, str): + # Check for an adapter file in the model path if PEFT is available + if is_peft_available(): + # `find_adapter_config_file` doesn't accept `trust_remote_code` + _hub_kwargs = {k: v for k, v in hub_kwargs.items() if k != "trust_remote_code"} + maybe_adapter_path = find_adapter_config_file( + model, + token=hub_kwargs["token"], + revision=hub_kwargs["revision"], + _commit_hash=hub_kwargs["_commit_hash"], + ) + + if maybe_adapter_path is not None: + with open(maybe_adapter_path, encoding="utf-8") as f: + adapter_config = json.load(f) + model = adapter_config["base_model_name_or_path"] + + config = AutoConfig.from_pretrained( + model, _from_pipeline=task, code_revision=code_revision, **hub_kwargs, **model_kwargs + ) + hub_kwargs["_commit_hash"] = config._commit_hash + + custom_tasks = {} + if config is not None and len(getattr(config, "custom_pipelines", {})) > 0: + custom_tasks = config.custom_pipelines + if task is None and trust_remote_code is not False: + if len(custom_tasks) == 1: + task = list(custom_tasks.keys())[0] + else: + raise RuntimeError( + "We can't infer the task automatically for this model as there are multiple tasks available. Pick " + f"one in {', '.join(custom_tasks.keys())}" + ) + + if task is None and model is not None: + if not isinstance(model, str): + raise RuntimeError( + "Inferring the task automatically requires to check the hub with a model_id defined as a `str`. " + f"{model} is not a valid model_id." + ) + task = get_task(model, token) + + # Retrieve the task + if task in custom_tasks: + normalized_task = task + targeted_task, task_options = clean_custom_task(custom_tasks[task]) + if pipeline_class is None: + if not trust_remote_code: + raise ValueError( + "Loading this pipeline requires you to execute the code in the pipeline file in that" + " repo on your local machine. Make sure you have read the code there to avoid malicious use, then" + " set the option `trust_remote_code=True` to remove this error." 
+ ) + class_ref = targeted_task["impl"] + pipeline_class = get_class_from_dynamic_module( + class_ref, + model, + code_revision=code_revision, + **hub_kwargs, + ) + else: + normalized_task, targeted_task, task_options = check_task(task) + if pipeline_class is None: + pipeline_class = targeted_task["impl"] + + # Use default model/config/tokenizer for the task if no model is provided + if model is None: + # At that point framework might still be undetermined + model, default_revision = get_default_model_and_revision(targeted_task, framework, task_options) + revision = revision if revision is not None else default_revision + logger.warning( + f"No model was supplied, defaulted to {model} and revision" + f" {revision} ({HUGGINGFACE_CO_RESOLVE_ENDPOINT}/{model}).\n" + "Using a pipeline without specifying a model name and revision in production is not recommended." + ) + if config is None and isinstance(model, str): + config = AutoConfig.from_pretrained(model, _from_pipeline=task, **hub_kwargs, **model_kwargs) + hub_kwargs["_commit_hash"] = config._commit_hash + + if device_map is not None: + if "device_map" in model_kwargs: + raise ValueError( + 'You cannot use both `pipeline(... device_map=..., model_kwargs={"device_map":...})` as those' + " arguments might conflict, use only one.)" + ) + if device is not None: + logger.warning( + "Both `device` and `device_map` are specified. `device` will override `device_map`. You" + " will most likely encounter unexpected behavior. Please remove `device` and keep `device_map`." + ) + model_kwargs["device_map"] = device_map + if torch_dtype is not None: + if "torch_dtype" in model_kwargs: + raise ValueError( + 'You cannot use both `pipeline(... torch_dtype=..., model_kwargs={"torch_dtype":...})` as those' + " arguments might conflict, use only one.)" + ) + if isinstance(torch_dtype, str) and hasattr(torch, torch_dtype): + torch_dtype = getattr(torch, torch_dtype) + model_kwargs["torch_dtype"] = torch_dtype + + model_name = model if isinstance(model, str) else None + + # Load the correct model if possible + # Infer the framework from the model if not already defined + if isinstance(model, str) or framework is None: + model_classes = {"tf": targeted_task["tf"], "pt": targeted_task["pt"]} + framework, model = infer_framework_load_model( + model, + model_classes=model_classes, + config=config, + framework=framework, + task=task, + **hub_kwargs, + **model_kwargs, + ) + + model_config = model.config + hub_kwargs["_commit_hash"] = model.config._commit_hash + load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None + load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None + load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None + + # If `model` (instance of `PretrainedModel` instead of `str`) is passed (and/or same for config), while + # `image_processor` or `feature_extractor` is `None`, the loading will fail. This happens particularly for some + # vision tasks when calling `pipeline()` with `model` and only one of the `image_processor` and `feature_extractor`. + # TODO: we need to make `NO_IMAGE_PROCESSOR_TASKS` and `NO_FEATURE_EXTRACTOR_TASKS` more robust to avoid such issue. + # This block is only temporarily to make CI green. 
+ if load_image_processor and load_feature_extractor: + load_feature_extractor = False + + if ( + tokenizer is None + and not load_tokenizer + and normalized_task not in NO_TOKENIZER_TASKS + # Using class name to avoid importing the real class. + and ( + model_config.__class__.__name__ in MULTI_MODEL_AUDIO_CONFIGS + or model_config.__class__.__name__ in MULTI_MODEL_VISION_CONFIGS + ) + ): + # This is a special category of models, that are fusions of multiple models + # so the model_config might not define a tokenizer, but it seems to be + # necessary for the task, so we're force-trying to load it. + load_tokenizer = True + if ( + image_processor is None + and not load_image_processor + and normalized_task not in NO_IMAGE_PROCESSOR_TASKS + # Using class name to avoid importing the real class. + and model_config.__class__.__name__ in MULTI_MODEL_VISION_CONFIGS + ): + # This is a special category of models, that are fusions of multiple models + # so the model_config might not define a tokenizer, but it seems to be + # necessary for the task, so we're force-trying to load it. + load_image_processor = True + if ( + feature_extractor is None + and not load_feature_extractor + and normalized_task not in NO_FEATURE_EXTRACTOR_TASKS + # Using class name to avoid importing the real class. + and model_config.__class__.__name__ in MULTI_MODEL_AUDIO_CONFIGS + ): + # This is a special category of models, that are fusions of multiple models + # so the model_config might not define a tokenizer, but it seems to be + # necessary for the task, so we're force-trying to load it. + load_feature_extractor = True + + if task in NO_TOKENIZER_TASKS: + # These will never require a tokenizer. + # the model on the other hand might have a tokenizer, but + # the files could be missing from the hub, instead of failing + # on such repos, we just force to not load it. + load_tokenizer = False + + if task in NO_FEATURE_EXTRACTOR_TASKS: + load_feature_extractor = False + if task in NO_IMAGE_PROCESSOR_TASKS: + load_image_processor = False + + if load_tokenizer: + # Try to infer tokenizer from model or config name (if provided as str) + if tokenizer is None: + if isinstance(model_name, str): + tokenizer = model_name + elif isinstance(config, str): + tokenizer = config + else: + # Impossible to guess what is the right tokenizer here + raise Exception( + "Impossible to guess which tokenizer to use. " + "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer." + ) + + # Instantiate tokenizer if needed + if isinstance(tokenizer, (str, tuple)): + if isinstance(tokenizer, tuple): + # For tuple we have (tokenizer name, {kwargs}) + use_fast = tokenizer[1].pop("use_fast", use_fast) + tokenizer_identifier = tokenizer[0] + tokenizer_kwargs = tokenizer[1] + else: + tokenizer_identifier = tokenizer + tokenizer_kwargs = model_kwargs.copy() + tokenizer_kwargs.pop("torch_dtype", None) + + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_identifier, use_fast=use_fast, _from_pipeline=task, **hub_kwargs, **tokenizer_kwargs + ) + + if load_image_processor: + # Try to infer image processor from model or config name (if provided as str) + if image_processor is None: + if isinstance(model_name, str): + image_processor = model_name + elif isinstance(config, str): + image_processor = config + # Backward compatibility, as `feature_extractor` used to be the name + # for `ImageProcessor`. 
+ elif feature_extractor is not None and isinstance(feature_extractor, BaseImageProcessor): + image_processor = feature_extractor + else: + # Impossible to guess what is the right image_processor here + raise Exception( + "Impossible to guess which image processor to use. " + "Please provide a PreTrainedImageProcessor class or a path/identifier " + "to a pretrained image processor." + ) + + # Instantiate image_processor if needed + if isinstance(image_processor, (str, tuple)): + image_processor = AutoImageProcessor.from_pretrained( + image_processor, _from_pipeline=task, **hub_kwargs, **model_kwargs + ) + + if load_feature_extractor: + # Try to infer feature extractor from model or config name (if provided as str) + if feature_extractor is None: + if isinstance(model_name, str): + feature_extractor = model_name + elif isinstance(config, str): + feature_extractor = config + else: + # Impossible to guess what is the right feature_extractor here + raise Exception( + "Impossible to guess which feature extractor to use. " + "Please provide a PreTrainedFeatureExtractor class or a path/identifier " + "to a pretrained feature extractor." + ) + + # Instantiate feature_extractor if needed + if isinstance(feature_extractor, (str, tuple)): + feature_extractor = AutoFeatureExtractor.from_pretrained( + feature_extractor, _from_pipeline=task, **hub_kwargs, **model_kwargs + ) + + if ( + feature_extractor._processor_class + and feature_extractor._processor_class.endswith("WithLM") + and isinstance(model_name, str) + ): + try: + import kenlm # to trigger `ImportError` if not installed + from pyctcdecode import BeamSearchDecoderCTC + + if os.path.isdir(model_name) or os.path.isfile(model_name): + decoder = BeamSearchDecoderCTC.load_from_dir(model_name) + else: + language_model_glob = os.path.join( + BeamSearchDecoderCTC._LANGUAGE_MODEL_SERIALIZED_DIRECTORY, "*" + ) + alphabet_filename = BeamSearchDecoderCTC._ALPHABET_SERIALIZED_FILENAME + allow_patterns = [language_model_glob, alphabet_filename] + decoder = BeamSearchDecoderCTC.load_from_hf_hub(model_name, allow_patterns=allow_patterns) + + kwargs["decoder"] = decoder + except ImportError as e: + logger.warning(f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. 
Error: {e}") + if not is_kenlm_available(): + logger.warning("Try to install `kenlm`: `pip install kenlm") + + if not is_pyctcdecode_available(): + logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode") + + if task == "translation" and model.config.task_specific_params: + for key in model.config.task_specific_params: + if key.startswith("translation"): + task = key + warnings.warn( + f'"translation" task was used, instead of "translation_XX_to_YY", defaulting to "{task}"', + UserWarning, + ) + break + + if tokenizer is not None: + kwargs["tokenizer"] = tokenizer + + if feature_extractor is not None: + kwargs["feature_extractor"] = feature_extractor + + if torch_dtype is not None: + kwargs["torch_dtype"] = torch_dtype + + if image_processor is not None: + kwargs["image_processor"] = image_processor + + if device is not None: + kwargs["device"] = device + + return pipeline_class(model=model, framework=framework, task=task, **kwargs) diff --git a/outpostkit/repository/_loaders/transformers/utils.py b/outpostkit/repository/_loaders/transformers/utils.py new file mode 100644 index 0000000..e69de29 diff --git a/outpostkit/repository/download.py b/outpostkit/repository/download.py new file mode 100644 index 0000000..5c29720 --- /dev/null +++ b/outpostkit/repository/download.py @@ -0,0 +1,5 @@ +from outpostkit.repository import Repository + + +def download_file_from_repo(full_name:str, filepath:str): + repo = Repository From a565c88095c371e77e8e92ff8460e38c6f82ded7 Mon Sep 17 00:00:00 2001 From: Ajeya Bhat Date: Thu, 25 Apr 2024 16:30:33 +0530 Subject: [PATCH 2/3] cherry pick --- outpostkit/repository/_loaders/transformers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/outpostkit/repository/_loaders/transformers/__init__.py b/outpostkit/repository/_loaders/transformers/__init__.py index 19d5eb4..c69009d 100644 --- a/outpostkit/repository/_loaders/transformers/__init__.py +++ b/outpostkit/repository/_loaders/transformers/__init__.py @@ -42,6 +42,7 @@ def setup_model_for_transformers( revision = str(kwargs.get("revision")) if token is not None: hub_kwargs["token"] = token + # if resolved is None: # if not isinstance(config, PretrainedConfig): # # We make a call to the config file first (which may be absent) to get the commit hash as soon as possible @@ -90,7 +91,6 @@ def setup_model_for_transformers( return_unused_kwargs=True, trust_remote_code=trust_remote_code, code_revision=code_revision, - _commit_hash=commit_hash, **hub_kwargs, **kwargs, ) From bcb6f6673c1df93d65e6bca66fffaaef12d4934a Mon Sep 17 00:00:00 2001 From: Ajeya Bhat Date: Tue, 30 Apr 2024 11:13:24 +0530 Subject: [PATCH 3/3] added deps, progress --- .../_loaders/transformers/__init__.py | 417 ++++++++++++++++-- pyproject.toml | 1 + 2 files changed, 377 insertions(+), 41 deletions(-) diff --git a/outpostkit/repository/_loaders/transformers/__init__.py b/outpostkit/repository/_loaders/transformers/__init__.py index c69009d..bc5a89c 100644 --- a/outpostkit/repository/_loaders/transformers/__init__.py +++ b/outpostkit/repository/_loaders/transformers/__init__.py @@ -1,9 +1,19 @@ import copy import json import os +from typing import Optional from outpostkit._utils.import_utils import is_peft_available, is_transformers_available from outpostkit.logger import init_outpost_logger +from outpostkit.repository._loaders.transformers.constants import ( + FLAX_WEIGHTS_NAME, + PT_WEIGHTS_INDEX_NAME, + PT_WEIGHTS_NAME, + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + TF2_WEIGHTS_NAME, + 
TF_WEIGHTS_NAME, +) from outpostkit.repository._loaders.transformers.peft import find_adapter_config_file logger = init_outpost_logger(__name__) @@ -13,12 +23,20 @@ # MODEL_CARD_NAME = "modelcard.json" +def _add_variant(weights_name: str, variant: Optional[str] = None) -> str: + if variant is not None: + splits = weights_name.split(".") + splits = splits[:-1] + [variant] + splits[-1:] + weights_name = ".".join(splits) + + return weights_name # ref: https://github.com/huggingface/transformers/blob/a5e5c92aea1e99cb84d7342bd63826ca6cd884c4/src/transformers/models/auto/auto_factory.py#L445 def setup_model_for_transformers( full_name_or_dir: str, store_dir: str, *model_args, **kwargs ): + use_safetensors: bool = kwargs.pop("use_safetensors", None) config = kwargs.pop("config", None) trust_remote_code = kwargs.pop("trust_remote_code", None) kwargs["_from_auto"] = True @@ -103,47 +121,364 @@ def setup_model_for_transformers( has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map has_local_code = type(config) in cls._model_mapping.keys() - trust_remote_code = resolve_trust_remote_code( - trust_remote_code, - pretrained_model_name_or_path, - has_local_code, - has_remote_code, - ) - - # Set the adapter kwargs - kwargs["adapter_kwargs"] = adapter_kwargs - - if has_remote_code and trust_remote_code: - class_ref = config.auto_map[cls.__name__] - model_class = get_class_from_dynamic_module( - class_ref, - pretrained_model_name_or_path, - code_revision=code_revision, - **hub_kwargs, - **kwargs, - ) - _ = hub_kwargs.pop("code_revision", None) - if os.path.isdir(pretrained_model_name_or_path): - model_class.register_for_auto_class(cls.__name__) + + from_tf = kwargs.pop("from_tf", False) + from_flax = kwargs.pop("from_flax", False) + variant = kwargs.pop("variant", None) + subfolder = kwargs.pop("subfolder", "") + commit_hash = kwargs.pop("_commit_hash", None) + variant = kwargs.pop("variant", None) + if pretrained_model_name_or_path is not None: + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) + if is_local: + if from_tf and os.path.isfile( + os.path.join( + pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index" + ) + ): + # Load from a TF 1.0 checkpoint in priority if from_tf + archive_file = os.path.join( + pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index" + ) + elif from_tf and os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME) + ): + # Load from a TF 2.0 checkpoint in priority if from_tf + archive_file = os.path.join( + pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME + ) + elif from_flax and os.path.isfile( + os.path.join( + pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME + ) + ): + # Load from a Flax checkpoint in priority if from_flax + archive_file = os.path.join( + pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME + ) + elif use_safetensors is not False and os.path.isfile( + os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(SAFE_WEIGHTS_NAME, variant), + ) + ): + # Load from a safetensors checkpoint + archive_file = os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(SAFE_WEIGHTS_NAME, variant), + ) + elif use_safetensors is not False and os.path.isfile( + os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), + ) + ): + # Load from a sharded safetensors checkpoint + archive_file = 
os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), + ) + is_sharded = True + elif os.path.isfile( + os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(PT_WEIGHTS_NAME, variant), + ) + ): + # Load from a PyTorch checkpoint + archive_file = os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(PT_WEIGHTS_NAME, variant), + ) + elif os.path.isfile( + os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(PT_WEIGHTS_INDEX_NAME, variant), + ) + ): + # Load from a sharded PyTorch checkpoint + archive_file = os.path.join( + pretrained_model_name_or_path, + subfolder, + _add_variant(PT_WEIGHTS_INDEX_NAME, variant), + ) + is_sharded = True + # At this stage we don't have a weight file so we will raise an error. + elif os.path.isfile( + os.path.join( + pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index" + ) + ) or os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME) + ): + raise OSError( + f"Error no file named {_add_variant(PT_WEIGHTS_NAME, variant)} found in directory" + f" {pretrained_model_name_or_path} but there is a file for TensorFlow weights. Use" + " `from_tf=True` to load this model from those weights." + ) + elif os.path.isfile( + os.path.join( + pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME + ) + ): + raise OSError( + f"Error no file named {_add_variant(PT_WEIGHTS_NAME, variant)} found in directory" + f" {pretrained_model_name_or_path} but there is a file for Flax weights. Use `from_flax=True`" + " to load this model from those weights." + ) + elif use_safetensors: + raise OSError( + f"Error no file named {_add_variant(SAFE_WEIGHTS_NAME, variant)} found in directory" + f" {pretrained_model_name_or_path}." + ) + else: + raise OSError( + f"Error no file named {_add_variant(PT_WEIGHTS_NAME, variant)}, {TF2_WEIGHTS_NAME}," + f" {TF_WEIGHTS_NAME + '.index'} or {FLAX_WEIGHTS_NAME} found in directory" + f" {pretrained_model_name_or_path}." + ) + elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)): + archive_file = pretrained_model_name_or_path + is_local = True + elif os.path.isfile( + os.path.join(subfolder, pretrained_model_name_or_path + ".index") + ): + if not from_tf: + raise ValueError( + f"We found a TensorFlow checkpoint at {pretrained_model_name_or_path + '.index'}, please set " + "from_tf to True to load from this checkpoint." 
+ ) + archive_file = os.path.join( + subfolder, pretrained_model_name_or_path + ".index" + ) + is_local = True + elif is_remote_url(pretrained_model_name_or_path): + filename = pretrained_model_name_or_path + resolved_archive_file = download_url(pretrained_model_name_or_path) else: - cls.register(config.__class__, model_class, exist_ok=True) - return model_class.from_pretrained( - pretrained_model_name_or_path, - *model_args, - config=config, - **hub_kwargs, - **kwargs, - ) - elif type(config) in cls._model_mapping.keys(): - model_class = _get_model_class(config, cls._model_mapping) - return model_class.from_pretrained( + # set correct filename + if from_tf: + filename = TF2_WEIGHTS_NAME + elif from_flax: + filename = FLAX_WEIGHTS_NAME + elif use_safetensors is not False: + filename = _add_variant(SAFE_WEIGHTS_NAME, variant) + else: + filename = _add_variant(PT_WEIGHTS_NAME, variant) + + try: + # Load from URL or cache if already cached + cached_file_kwargs = { + "revision": revision, + "subfolder": subfolder, + "_raise_exceptions_for_gated_repo": False, + "_raise_exceptions_for_missing_entries": False, + "_commit_hash": commit_hash, + } + resolved_archive_file = cached_file( + pretrained_model_name_or_path, filename, **cached_file_kwargs + ) + + # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None + # result when internet is up, the repo and revision exist, but the file does not. + if resolved_archive_file is None and filename == _add_variant( + SAFE_WEIGHTS_NAME, variant + ): + # Maybe the checkpoint is sharded, we try to grab the index name in this case. + resolved_archive_file = cached_file( + pretrained_model_name_or_path, + _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), + **cached_file_kwargs, + ) + if resolved_archive_file is not None: + is_sharded = True + elif use_safetensors: + if revision == "main": + ( + resolved_archive_file, + revision, + is_sharded, + ) = auto_conversion( + pretrained_model_name_or_path, **cached_file_kwargs + ) + cached_file_kwargs["revision"] = revision + if resolved_archive_file is None: + raise OSError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(SAFE_WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)} " + "and thus cannot be loaded with `safetensors`. Please make sure that the model has " + "been saved with `safe_serialization=True` or do not set `use_safetensors=True`." + ) + else: + # This repo has no safetensors file of any kind, we switch to PyTorch. + filename = _add_variant(WEIGHTS_NAME, variant) + resolved_archive_file = cached_file( + pretrained_model_name_or_path, + filename, + **cached_file_kwargs, + ) + if resolved_archive_file is None and filename == _add_variant( + WEIGHTS_NAME, variant + ): + # Maybe the checkpoint is sharded, we try to grab the index name in this case. 
+ resolved_archive_file = cached_file( + pretrained_model_name_or_path, + _add_variant(WEIGHTS_INDEX_NAME, variant), + **cached_file_kwargs, + ) + if resolved_archive_file is not None: + is_sharded = True + + if resolved_archive_file is not None: + if filename in [WEIGHTS_NAME, WEIGHTS_INDEX_NAME]: + # If the PyTorch file was found, check if there is a safetensors file on the repository + # If there is no safetensors file on the repositories, start an auto conversion + safe_weights_name = ( + SAFE_WEIGHTS_INDEX_NAME if is_sharded else SAFE_WEIGHTS_NAME + ) + has_file_kwargs = { + "revision": revision, + "token": token, + } + cached_file_kwargs = { + "subfolder": subfolder, + "_raise_exceptions_for_gated_repo": False, + "_raise_exceptions_for_missing_entries": False, + "_commit_hash": commit_hash, + **has_file_kwargs, + } + if not has_file( + pretrained_model_name_or_path, + safe_weights_name, + **has_file_kwargs, + ): + Thread( + target=auto_conversion, + args=(pretrained_model_name_or_path,), + kwargs={ + "ignore_errors_during_conversion": True, + **cached_file_kwargs, + }, + name="Thread-autoconversion", + ).start() + else: + # Otherwise, no PyTorch file was found, maybe there is a TF or Flax model file. + # We try those to give a helpful error message. + has_file_kwargs = { + "revision": revision, + "proxies": proxies, + "token": token, + } + if has_file( + pretrained_model_name_or_path, + TF2_WEIGHTS_NAME, + **has_file_kwargs, + ): + raise OSError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(PT_WEIGHTS_NAME, variant)} but there is a file for TensorFlow weights." + " Use `from_tf=True` to load this model from those weights." + ) + elif has_file( + pretrained_model_name_or_path, + FLAX_WEIGHTS_NAME, + **has_file_kwargs, + ): + raise OSError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(PT_WEIGHTS_NAME, variant)} but there is a file for Flax weights. Use" + " `from_flax=True` to load this model from those weights." + ) + elif variant is not None and has_file( + pretrained_model_name_or_path, + PT_WEIGHTS_NAME, + **has_file_kwargs, + ): + raise OSError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(PT_WEIGHTS_NAME, variant)} but there is a file without the variant" + f" {variant}. Use `variant=None` to load this model from those weights." + ) + else: + raise OSError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(PT_WEIGHTS_NAME, variant)}, {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or" + f" {FLAX_WEIGHTS_NAME}." + ) + except OSError: + # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted + # to the original exception. + raise + except Exception as e: + # For any other exception, we throw a generic error. + raise OSError( + f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" + f" directory containing a file named {_add_variant(PT_WEIGHTS_NAME, variant)}," + f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or {FLAX_WEIGHTS_NAME}." 
+ ) from e + + if is_local: + logger.info(f"loading weights file {archive_file}") + resolved_archive_file = archive_file + else: + logger.info( + f"loading weights file {filename} from cache at {resolved_archive_file}" + ) + else: + resolved_archive_file = None + + # We'll need to download and cache each checkpoint shard if the checkpoint is sharded. + if is_sharded: + # rsolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( pretrained_model_name_or_path, - *model_args, - config=config, - **hub_kwargs, - **kwargs, + resolved_archive_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + subfolder=subfolder, + _commit_hash=commit_hash, ) - raise ValueError( - f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" - f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." - ) + + if ( + is_safetensors_available() + and isinstance(resolved_archive_file, str) + and resolved_archive_file.endswith(".safetensors") + ): + with safe_open(resolved_archive_file, framework="pt") as f: + metadata = f.metadata() + + if metadata.get("format") == "pt": + pass + elif metadata.get("format") == "tf": + from_tf = True + logger.info( + "A TensorFlow safetensors file is being loaded in a PyTorch model." + ) + elif metadata.get("format") == "flax": + from_flax = True + logger.info("A Flax safetensors file is being loaded in a PyTorch model.") + elif metadata.get("format") == "mlx": + # This is a mlx file, we assume weights are compatible with pt + pass + else: + raise ValueError( + f"Incompatible safetensors file. File metadata is not ['pt', 'tf', 'flax', 'mlx'] but {metadata.get('format')}" + ) + + from_pt = not (from_tf | from_flax) diff --git a/pyproject.toml b/pyproject.toml index ebdd0d4..674d650 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "packaging", "pydantic>1", "typing_extensions>=4.5.0", + "dataclasses_json", ] optional-dependencies = { dev = [ "pylint",