Loading Makefile +6 −5 Original line number Diff line number Diff line Loading @@ -50,13 +50,14 @@ pdocs: dataset: mkdir -p ${DATASET_DIR} if [ ! -d ${DATASET_DIR}/chafen_arknights ]; then \ git clone https://huggingface.co/datasets/deepghs/chafen_arknights.git ${DATASET_DIR}/chafen_arknights; \ hfutils download -r deepghs/chafen_arknights -t dataset -d . -o ${DATASET_DIR}/chafen_arknights; \ fi if [ ! -d ${DATASET_DIR}/monochrome_danbooru ]; then \ git clone https://huggingface.co/datasets/deepghs/monochrome_danbooru.git ${DATASET_DIR}/monochrome_danbooru; \ hfutils download -r deepghs/monochrome_danbooru -t dataset -d . -o ${DATASET_DIR}/monochrome_danbooru; \ fi if [ ! -d ${DATASET_DIR}/images_test_v1 ]; then \ mkdir -p ${DATASET_DIR}/images_test_v1 && \ curl -L -o ${DATASET_DIR}/images_test_v1/images_test_v1.tar.xz https://huggingface.co/datasets/deepghs/character_similarity/resolve/main/images_test_v1.tar.xz && \ cd ${DATASET_DIR}/images_test_v1 && tar -xvf images_test_v1.tar.xz && rm -rf *.tar.xz; \ hfutils download -r deepghs/character_similarity -t dataset -a images_test_v1.tar.xz -o ${DATASET_DIR}/images_test_v1; \ fi if [ ! 
-d ${DATASET_DIR}/unsplash_1000 ]; then \ hfutils download -r deepghs/realutils_unittest -a unsplash_1000.zip -o ${DATASET_DIR}/unsplash_1000; \ fi imgutils/generic/__init__.py +1 −0 Original line number Diff line number Diff line Loading @@ -4,4 +4,5 @@ Overview: """ from .classify import * from .enhance import * from .siglip import * from .yolo import * imgutils/generic/siglip.py +102 −16 Original line number Diff line number Diff line Loading @@ -10,7 +10,14 @@ from tokenizers import Tokenizer from ..data import MultiImagesTyping, load_images from ..preprocess import create_pillow_transforms from ..utils import open_onnx_model, vreplace, sigmoid from ..utils import open_onnx_model, vreplace, sigmoid, ts_lru_cache __all__ = [ 'SigLIPModel', 'siglip_image_encode', 'siglip_text_encode', 'siglip_predict', ] class SigLIPModel: Loading Loading @@ -90,6 +97,7 @@ class SigLIPModel: """ with self._model_lock: if model_name not in self._image_encoders: self._check_model_name(model_name) self._image_encoders[model_name] = open_onnx_model(hf_hub_download( repo_id=self.repo_id, repo_type='model', Loading @@ -109,6 +117,7 @@ class SigLIPModel: """ with self._model_lock: if model_name not in self._image_preprocessors: self._check_model_name(model_name) with open(hf_hub_download( repo_id=self.repo_id, repo_type='model', Loading @@ -129,6 +138,7 @@ class SigLIPModel: """ with self._model_lock: if model_name not in self._text_encoders: self._check_model_name(model_name) self._text_encoders[model_name] = open_onnx_model(hf_hub_download( repo_id=self.repo_id, repo_type='model', Loading @@ -148,6 +158,7 @@ class SigLIPModel: """ with self._model_lock: if model_name not in self._text_tokenizers: self._check_model_name(model_name) self._text_tokenizers[model_name] = Tokenizer.from_file(hf_hub_download( repo_id=self.repo_id, repo_type='model', Loading @@ -167,6 +178,7 @@ class SigLIPModel: """ with self._model_lock: if model_name not in self._logit_scales: 
self._check_model_name(model_name) with open(hf_hub_download( repo_id=self.repo_id, repo_type='model', Loading @@ -177,7 +189,19 @@ class SigLIPModel: return self._logit_scales[model_name] def get_siglip_image_embedding(self, images: MultiImagesTyping, model_name: str, fmt: Any = 'embeddings'): def _get_siglip_image_embedding(self, images: MultiImagesTyping, model_name: str, fmt: Any = 'embeddings'): preprocessor = self._open_image_preprocessor(model_name) model = self._open_image_encoder(model_name) images = load_images(images, mode='RGB', force_background='white') input_ = np.stack([preprocessor(image) for image in images]) encodings, embeddings = model.run(['encodings', 'embeddings'], {'pixel_values': input_}) return vreplace(fmt, { 'encodings': encodings, 'embeddings': embeddings, }) def image_encode(self, images: MultiImagesTyping, model_name: str, fmt: Any = 'embeddings'): """ Generate embeddings for input images using the SigLIP model. Loading @@ -189,18 +213,13 @@ class SigLIPModel: :return: Image embeddings or encodings based on fmt parameter """ preprocessor = self._open_image_preprocessor(model_name) model = self._open_image_encoder(model_name) images = load_images(images, mode='RGB', force_background='white') input_ = np.stack([preprocessor(image) for image in images]) encodings, embeddings = model.run(['encodings', 'embeddings'], {'pixel_values': input_}) return vreplace(fmt, { 'encodings': encodings, 'embeddings': embeddings, }) return self._get_siglip_image_embedding( images=images, model_name=model_name, fmt=fmt, ) def get_siglip_text_embedding(self, texts: Union[str, List[str]], model_name: str, fmt: Any = 'embeddings'): def _get_siglip_text_embedding(self, texts: Union[str, List[str]], model_name: str, fmt: Any = 'embeddings'): """ Generate embeddings for input texts using the SigLIP model. 
Loading @@ -227,7 +246,25 @@ class SigLIPModel: 'embeddings': embeddings, }) def classify_with_siglip( def text_encode(self, texts: Union[str, List[str]], model_name: str, fmt: Any = 'embeddings'): """ Generate embeddings for input texts using the SigLIP model. :param texts: Input text or list of texts :type texts: Union[str, List[str]] :param model_name: Name of the SigLIP model variant to use :type model_name: str :param fmt: Output format, either 'encodings' or 'embeddings' :return: Text embeddings or encodings based on fmt parameter """ return self._get_siglip_text_embedding( texts=texts, model_name=model_name, fmt=fmt, ) def predict( self, images: Union[MultiImagesTyping, np.ndarray], texts: Union[List[str], str, np.ndarray], Loading @@ -250,7 +287,7 @@ class SigLIPModel: extra_values = {} if not isinstance(images, np.ndarray): image_embeddings, image_encodings = \ self.get_siglip_image_embedding(images, model_name=model_name, fmt=('embeddings', 'encodings')) self._get_siglip_image_embedding(images, model_name=model_name, fmt=('embeddings', 'encodings')) extra_values['image_embeddings'] = image_embeddings extra_values['image_encodings'] = image_encodings images = image_embeddings Loading @@ -258,7 +295,7 @@ class SigLIPModel: if not isinstance(texts, np.ndarray): text_embeddings, text_encodings = \ self.get_siglip_text_embedding(texts, model_name=model_name, fmt=('embeddings', 'encodings')) self._get_siglip_text_embedding(texts, model_name=model_name, fmt=('embeddings', 'encodings')) extra_values['text_embeddings'] = text_embeddings extra_values['text_encodings'] = text_encodings texts = text_embeddings Loading @@ -275,3 +312,52 @@ class SigLIPModel: 'predictions': predictions, **extra_values, }) def clear(self): self._image_encoders.clear() self._image_preprocessors.clear() self._text_encoders.clear() self._text_tokenizers.clear() self._logit_scales.clear() @ts_lru_cache() def _open_models_for_repo_id(repo_id: str, hf_token: Optional[str] = None) -> 
SigLIPModel: return SigLIPModel(repo_id, hf_token=hf_token) def siglip_image_encode(images: MultiImagesTyping, repo_id: str, model_name: str, fmt: Any = 'embeddings', hf_token: Optional[str] = None): model = _open_models_for_repo_id(repo_id, hf_token=hf_token) return model.image_encode( images=images, model_name=model_name, fmt=fmt, ) def siglip_text_encode(texts: Union[str, List[str]], repo_id: str, model_name: str, fmt: Any = 'embeddings', hf_token: Optional[str] = None): model = _open_models_for_repo_id(repo_id, hf_token=hf_token) return model.text_encode( texts=texts, model_name=model_name, fmt=fmt, ) def siglip_predict( images: Union[MultiImagesTyping, np.ndarray], texts: Union[List[str], str, np.ndarray], repo_id: str, model_name: str, fmt: Any = 'predictions', hf_token: Optional[str] = None, ): model = _open_models_for_repo_id(repo_id, hf_token=hf_token) return model.predict( images=images, texts=texts, model_name=model_name, fmt=fmt ) test/generic/test_siglip.py 0 → 100644 +86 −0 Original line number Diff line number Diff line import re import numpy as np import pytest from imgutils.generic import siglip_image_encode, siglip_text_encode, siglip_predict from imgutils.generic.siglip import _open_models_for_repo_id from test.testings import get_testfile @pytest.fixture(scope='module') def siglip_repo_id(): return 'deepghs/siglip_onnx' @pytest.fixture(scope='module') def siglip_model_name(): return 'google/siglip-base-patch16-256-multilingual' @pytest.fixture(scope='module', autouse=True) def _release_model_after_run(siglip_repo_id): try: yield finally: _open_models_for_repo_id(siglip_repo_id).clear() @pytest.mark.unittest class TestGenericSiglip: @pytest.mark.parametrize(['name'], [ ('unsplash_sZzmhn2xjQY',), ('unsplash_S-8ntPEsSwo',), ('unsplash_tB4-ftQ4zyI',), ('unsplash_l6KamCXeB4U',), ('unsplash__9dAwWA4LD8',), ('unsplash_LlsAieNJE70',), ('unsplash_HWIOLU7_O6w',), ('unsplash_1AAa78W_Ezc',), ('unsplash_0TPmrjTXjSs',), ('unsplash_0yAVtZiYkJY',) ]) def 
test_siglip_image_encode(self, name, siglip_repo_id, siglip_model_name): src_image = get_testfile('dataset', 'unsplash_1000', f'{name}.jpg') dst_npy = get_testfile('siglip', 'unsplash_1000', f'{name}.npy') embedding = siglip_image_encode(src_image, repo_id=siglip_repo_id, model_name=siglip_model_name) expected_embedding = np.load(dst_npy) np.testing.assert_allclose(embedding, expected_embedding, rtol=1e-03, atol=1e-05) @pytest.mark.parametrize(['text'], [ ("a red car parked on the street",), ("beautiful sunset over mountain landscape",), ("two cats playing with yarn",), ("fresh fruits in a wooden bowl",), ("person reading book under tree",), ("colorful hot air balloon in blue sky",), ("children playing soccer in the park",), ("rustic cabin surrounded by pine trees",), ("waves crashing on sandy beach",), ("chef cooking in modern kitchen",), ]) def test_siglip_text_encode(self, text, siglip_repo_id, siglip_model_name): dst_npy = get_testfile('siglip', 'text', re.sub(r'[\W_]+', '_', text).strip('_') + '.npy') embedding = siglip_text_encode(text, repo_id=siglip_repo_id, model_name=siglip_model_name) expected_embedding = np.load(dst_npy) np.testing.assert_allclose(embedding, expected_embedding, rtol=1e-03, atol=1e-05) def test_siglip_predict(self, siglip_repo_id, siglip_model_name): result = siglip_predict( images=[ get_testfile('clip_cats.jpg'), get_testfile('idolsankaku', '3.jpg'), ], texts=[ 'a photo of a cat', 'a photo of 2 cats', 'a photo of 2 dogs', 'a photo of a woman', ], repo_id=siglip_repo_id, model_name=siglip_model_name, ) expected_result = np.array( [[0.0013782851165160537, 0.27010253071784973, 9.751768811838701e-05, 3.6702780814579228e-09], [1.2790776438009743e-08, 4.396981001519862e-09, 3.2838454178119036e-10, 1.0559210750216153e-06]]) np.testing.assert_allclose(result, expected_result, atol=3e-4) test/testfile/clip/text/a_red_car_parked_on_the_street.npy 0 → 100644 +2.13 KiB File added.No diff preview for this file type. View file Loading
Makefile +6 −5 Original line number Diff line number Diff line Loading @@ -50,13 +50,14 @@ pdocs: dataset: mkdir -p ${DATASET_DIR} if [ ! -d ${DATASET_DIR}/chafen_arknights ]; then \ git clone https://huggingface.co/datasets/deepghs/chafen_arknights.git ${DATASET_DIR}/chafen_arknights; \ hfutils download -r deepghs/chafen_arknights -t dataset -d . -o ${DATASET_DIR}/chafen_arknights; \ fi if [ ! -d ${DATASET_DIR}/monochrome_danbooru ]; then \ git clone https://huggingface.co/datasets/deepghs/monochrome_danbooru.git ${DATASET_DIR}/monochrome_danbooru; \ hfutils download -r deepghs/monochrome_danbooru -t dataset -d . -o ${DATASET_DIR}/monochrome_danbooru; \ fi if [ ! -d ${DATASET_DIR}/images_test_v1 ]; then \ mkdir -p ${DATASET_DIR}/images_test_v1 && \ curl -L -o ${DATASET_DIR}/images_test_v1/images_test_v1.tar.xz https://huggingface.co/datasets/deepghs/character_similarity/resolve/main/images_test_v1.tar.xz && \ cd ${DATASET_DIR}/images_test_v1 && tar -xvf images_test_v1.tar.xz && rm -rf *.tar.xz; \ hfutils download -r deepghs/character_similarity -t dataset -a images_test_v1.tar.xz -o ${DATASET_DIR}/images_test_v1; \ fi if [ ! -d ${DATASET_DIR}/unsplash_1000 ]; then \ hfutils download -r deepghs/realutils_unittest -a unsplash_1000.zip -o ${DATASET_DIR}/unsplash_1000; \ fi
imgutils/generic/__init__.py +1 −0 Original line number Diff line number Diff line Loading @@ -4,4 +4,5 @@ Overview: """ from .classify import * from .enhance import * from .siglip import * from .yolo import *
imgutils/generic/siglip.py +102 −16 Original line number Diff line number Diff line Loading @@ -10,7 +10,14 @@ from tokenizers import Tokenizer from ..data import MultiImagesTyping, load_images from ..preprocess import create_pillow_transforms from ..utils import open_onnx_model, vreplace, sigmoid from ..utils import open_onnx_model, vreplace, sigmoid, ts_lru_cache __all__ = [ 'SigLIPModel', 'siglip_image_encode', 'siglip_text_encode', 'siglip_predict', ] class SigLIPModel: Loading Loading @@ -90,6 +97,7 @@ class SigLIPModel: """ with self._model_lock: if model_name not in self._image_encoders: self._check_model_name(model_name) self._image_encoders[model_name] = open_onnx_model(hf_hub_download( repo_id=self.repo_id, repo_type='model', Loading @@ -109,6 +117,7 @@ class SigLIPModel: """ with self._model_lock: if model_name not in self._image_preprocessors: self._check_model_name(model_name) with open(hf_hub_download( repo_id=self.repo_id, repo_type='model', Loading @@ -129,6 +138,7 @@ class SigLIPModel: """ with self._model_lock: if model_name not in self._text_encoders: self._check_model_name(model_name) self._text_encoders[model_name] = open_onnx_model(hf_hub_download( repo_id=self.repo_id, repo_type='model', Loading @@ -148,6 +158,7 @@ class SigLIPModel: """ with self._model_lock: if model_name not in self._text_tokenizers: self._check_model_name(model_name) self._text_tokenizers[model_name] = Tokenizer.from_file(hf_hub_download( repo_id=self.repo_id, repo_type='model', Loading @@ -167,6 +178,7 @@ class SigLIPModel: """ with self._model_lock: if model_name not in self._logit_scales: self._check_model_name(model_name) with open(hf_hub_download( repo_id=self.repo_id, repo_type='model', Loading @@ -177,7 +189,19 @@ class SigLIPModel: return self._logit_scales[model_name] def get_siglip_image_embedding(self, images: MultiImagesTyping, model_name: str, fmt: Any = 'embeddings'): def _get_siglip_image_embedding(self, images: MultiImagesTyping, model_name: str, fmt: Any 
= 'embeddings'): preprocessor = self._open_image_preprocessor(model_name) model = self._open_image_encoder(model_name) images = load_images(images, mode='RGB', force_background='white') input_ = np.stack([preprocessor(image) for image in images]) encodings, embeddings = model.run(['encodings', 'embeddings'], {'pixel_values': input_}) return vreplace(fmt, { 'encodings': encodings, 'embeddings': embeddings, }) def image_encode(self, images: MultiImagesTyping, model_name: str, fmt: Any = 'embeddings'): """ Generate embeddings for input images using the SigLIP model. Loading @@ -189,18 +213,13 @@ class SigLIPModel: :return: Image embeddings or encodings based on fmt parameter """ preprocessor = self._open_image_preprocessor(model_name) model = self._open_image_encoder(model_name) images = load_images(images, mode='RGB', force_background='white') input_ = np.stack([preprocessor(image) for image in images]) encodings, embeddings = model.run(['encodings', 'embeddings'], {'pixel_values': input_}) return vreplace(fmt, { 'encodings': encodings, 'embeddings': embeddings, }) return self._get_siglip_image_embedding( images=images, model_name=model_name, fmt=fmt, ) def get_siglip_text_embedding(self, texts: Union[str, List[str]], model_name: str, fmt: Any = 'embeddings'): def _get_siglip_text_embedding(self, texts: Union[str, List[str]], model_name: str, fmt: Any = 'embeddings'): """ Generate embeddings for input texts using the SigLIP model. Loading @@ -227,7 +246,25 @@ class SigLIPModel: 'embeddings': embeddings, }) def classify_with_siglip( def text_encode(self, texts: Union[str, List[str]], model_name: str, fmt: Any = 'embeddings'): """ Generate embeddings for input texts using the SigLIP model. 
:param texts: Input text or list of texts :type texts: Union[str, List[str]] :param model_name: Name of the SigLIP model variant to use :type model_name: str :param fmt: Output format, either 'encodings' or 'embeddings' :return: Text embeddings or encodings based on fmt parameter """ return self._get_siglip_text_embedding( texts=texts, model_name=model_name, fmt=fmt, ) def predict( self, images: Union[MultiImagesTyping, np.ndarray], texts: Union[List[str], str, np.ndarray], Loading @@ -250,7 +287,7 @@ class SigLIPModel: extra_values = {} if not isinstance(images, np.ndarray): image_embeddings, image_encodings = \ self.get_siglip_image_embedding(images, model_name=model_name, fmt=('embeddings', 'encodings')) self._get_siglip_image_embedding(images, model_name=model_name, fmt=('embeddings', 'encodings')) extra_values['image_embeddings'] = image_embeddings extra_values['image_encodings'] = image_encodings images = image_embeddings Loading @@ -258,7 +295,7 @@ class SigLIPModel: if not isinstance(texts, np.ndarray): text_embeddings, text_encodings = \ self.get_siglip_text_embedding(texts, model_name=model_name, fmt=('embeddings', 'encodings')) self._get_siglip_text_embedding(texts, model_name=model_name, fmt=('embeddings', 'encodings')) extra_values['text_embeddings'] = text_embeddings extra_values['text_encodings'] = text_encodings texts = text_embeddings Loading @@ -275,3 +312,52 @@ class SigLIPModel: 'predictions': predictions, **extra_values, }) def clear(self): self._image_encoders.clear() self._image_preprocessors.clear() self._text_encoders.clear() self._text_tokenizers.clear() self._logit_scales.clear() @ts_lru_cache() def _open_models_for_repo_id(repo_id: str, hf_token: Optional[str] = None) -> SigLIPModel: return SigLIPModel(repo_id, hf_token=hf_token) def siglip_image_encode(images: MultiImagesTyping, repo_id: str, model_name: str, fmt: Any = 'embeddings', hf_token: Optional[str] = None): model = _open_models_for_repo_id(repo_id, hf_token=hf_token) return 
model.image_encode( images=images, model_name=model_name, fmt=fmt, ) def siglip_text_encode(texts: Union[str, List[str]], repo_id: str, model_name: str, fmt: Any = 'embeddings', hf_token: Optional[str] = None): model = _open_models_for_repo_id(repo_id, hf_token=hf_token) return model.text_encode( texts=texts, model_name=model_name, fmt=fmt, ) def siglip_predict( images: Union[MultiImagesTyping, np.ndarray], texts: Union[List[str], str, np.ndarray], repo_id: str, model_name: str, fmt: Any = 'predictions', hf_token: Optional[str] = None, ): model = _open_models_for_repo_id(repo_id, hf_token=hf_token) return model.predict( images=images, texts=texts, model_name=model_name, fmt=fmt )
# test/generic/test_siglip.py (new file in this diff, mode 100644, +86 -0)
"""
Overview:
    Regression tests for the functional SigLIP helpers exported by ``imgutils.generic``:
    ``siglip_image_encode``, ``siglip_text_encode`` and ``siglip_predict``.
"""
import re

import numpy as np
import pytest

from imgutils.generic import siglip_image_encode, siglip_text_encode, siglip_predict
from imgutils.generic.siglip import _open_models_for_repo_id
from test.testings import get_testfile

# Stems of the sample images; each one is looked up as ``<stem>.jpg`` (input)
# and ``<stem>.npy`` (stored reference embedding).
_UNSPLASH_NAMES = [
    'unsplash_sZzmhn2xjQY',
    'unsplash_S-8ntPEsSwo',
    'unsplash_tB4-ftQ4zyI',
    'unsplash_l6KamCXeB4U',
    'unsplash__9dAwWA4LD8',
    'unsplash_LlsAieNJE70',
    'unsplash_HWIOLU7_O6w',
    'unsplash_1AAa78W_Ezc',
    'unsplash_0TPmrjTXjSs',
    'unsplash_0yAVtZiYkJY',
]

# Prompts whose reference embeddings are stored under a slug of the prompt text.
_SAMPLE_TEXTS = [
    "a red car parked on the street",
    "beautiful sunset over mountain landscape",
    "two cats playing with yarn",
    "fresh fruits in a wooden bowl",
    "person reading book under tree",
    "colorful hot air balloon in blue sky",
    "children playing soccer in the park",
    "rustic cabin surrounded by pine trees",
    "waves crashing on sandy beach",
    "chef cooking in modern kitchen",
]

# Runs of non-word characters and underscores; used to turn a prompt into a file stem.
_NON_WORD_RUN = re.compile(r'[\W_]+')


@pytest.fixture(scope='module')
def siglip_repo_id():
    """
    Repository id passed as ``repo_id`` to every SigLIP helper in this module.
    """
    return 'deepghs/siglip_onnx'


@pytest.fixture(scope='module')
def siglip_model_name():
    """
    Model name passed as ``model_name`` to every SigLIP helper in this module.
    """
    return 'google/siglip-base-patch16-256-multilingual'


@pytest.fixture(scope='module', autouse=True)
def _release_model_after_run(siglip_repo_id):
    """
    Module-wide teardown: after the tests of this module have run, call ``clear()``
    on the model object that ``_open_models_for_repo_id`` returns for the test repository.
    """
    try:
        yield
    finally:
        cached_model = _open_models_for_repo_id(siglip_repo_id)
        cached_model.clear()


@pytest.mark.unittest
class TestGenericSiglip:
    """
    Compares the outputs of the SigLIP helpers against stored reference values.
    """

    @pytest.mark.parametrize('name', _UNSPLASH_NAMES)
    def test_siglip_image_encode(self, name, siglip_repo_id, siglip_model_name):
        """
        The image embedding must match the stored ``.npy`` reference within tolerance.
        """
        image_path = get_testfile('dataset', 'unsplash_1000', f'{name}.jpg')
        reference_path = get_testfile('siglip', 'unsplash_1000', f'{name}.npy')

        actual = siglip_image_encode(
            image_path,
            repo_id=siglip_repo_id,
            model_name=siglip_model_name,
        )
        reference = np.load(reference_path)
        np.testing.assert_allclose(actual, reference, rtol=1e-03, atol=1e-05)

    @pytest.mark.parametrize('text', _SAMPLE_TEXTS)
    def test_siglip_text_encode(self, text, siglip_repo_id, siglip_model_name):
        """
        The text embedding must match the stored ``.npy`` reference within tolerance.
        """
        # The reference file is named after the prompt, with every run of
        # non-word characters collapsed to a single underscore.
        slug = _NON_WORD_RUN.sub('_', text).strip('_')
        reference_path = get_testfile('siglip', 'text', slug + '.npy')

        actual = siglip_text_encode(
            text,
            repo_id=siglip_repo_id,
            model_name=siglip_model_name,
        )
        reference = np.load(reference_path)
        np.testing.assert_allclose(actual, reference, rtol=1e-03, atol=1e-05)

    def test_siglip_predict(self, siglip_repo_id, siglip_model_name):
        """
        Predictions for two images against four prompts must match the recorded matrix.
        """
        image_files = [
            get_testfile('clip_cats.jpg'),
            get_testfile('idolsankaku', '3.jpg'),
        ]
        prompts = [
            'a photo of a cat',
            'a photo of 2 cats',
            'a photo of 2 dogs',
            'a photo of a woman',
        ]

        actual = siglip_predict(
            images=image_files,
            texts=prompts,
            repo_id=siglip_repo_id,
            model_name=siglip_model_name,
        )

        # One row per image, one column per prompt.
        recorded = np.array([
            [0.0013782851165160537, 0.27010253071784973, 9.751768811838701e-05, 3.6702780814579228e-09],
            [1.2790776438009743e-08, 4.396981001519862e-09, 3.2838454178119036e-10, 1.0559210750216153e-06],
        ])
        np.testing.assert_allclose(actual, recorded, atol=3e-4)
test/testfile/clip/text/a_red_car_parked_on_the_street.npy 0 → 100644 +2.13 KiB File added. No diff preview for this file type. View file