Commit ee904dbf authored by narugo1992's avatar narugo1992
Browse files

dev(narugo): add docs for vit

parent 7ff62321
Loading
Loading
Loading
Loading
+49 −0
Original line number Diff line number Diff line
"""
This module provides functionality for creating image transformation pipelines specifically for Vision Transformer (ViT) models.
It includes functions to create transforms for image preprocessing tasks like resizing, rescaling, normalization and tensor conversion.
The transforms are compatible with both custom usage and the ViT image processors from Hugging Face's transformers library.

The module supports creating transform pipelines that match the preprocessing steps used in ViT models,
ensuring images are properly prepared for model inference.
"""

from PIL import Image

from .base import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, _DEFAULT, register_creators_for_transformers, \
@@ -17,6 +26,33 @@ def create_vit_transforms(
        image_mean=_DEFAULT,
        image_std=_DEFAULT,
):
    """
    Create a composition of image transforms typically used for ViT models.

    This function creates a transform pipeline that can include resizing, tensor conversion,
    rescaling, and normalization operations. The transforms are applied in sequence to
    prepare images for ViT model input.

    :param do_resize: Whether to resize the input images
    :type do_resize: bool
    :param size: Target size for resizing, should be a dict with 'height' and 'width' keys
    :type size: dict
    :param resample: PIL resampling filter to use for resizing
    :type resample: int
    :param do_rescale: Whether to rescale pixel values
    :type do_rescale: bool
    :param rescale_factor: Factor to use for rescaling pixel values
    :type rescale_factor: float
    :param do_normalize: Whether to normalize the image
    :type do_normalize: bool
    :param image_mean: Mean values for normalization
    :type image_mean: tuple or list
    :param image_std: Standard deviation values for normalization
    :type image_std: tuple or list

    :return: A composition of image transforms
    :rtype: PillowCompose
    """
    # Initialize default values
    size = size if size is not _DEFAULT else _DEFAULT_SIZE
    image_mean = image_mean if image_mean is not _DEFAULT else IMAGENET_DEFAULT_MEAN
@@ -50,6 +86,19 @@ def create_vit_transforms(

@register_creators_for_transformers()
def create_transforms_from_vit_processor(processor):
    """
    Create image transforms from a Hugging Face ViT processor configuration.

    This function takes a ViT image processor from the transformers library and creates
    a matching transform pipeline that replicates the processor's preprocessing steps.

    :param processor: A ViT image processor from Hugging Face transformers
    :type processor: ViTImageProcessor

    :return: A composition of image transforms matching the processor's configuration
    :rtype: PillowCompose
    :raises NotProcessorTypeError: If the provided processor is not a ViTImageProcessor
    """
    _check_transformers()
    from transformers import ViTImageProcessor