 import torch
 import numpy as np
 from gguf import *
-from transformers import CLIPModel, CLIPProcessor
+from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel

 TEXT = "clip.text"
 VISION = "clip.vision"
@@ -78,11 +78,19 @@ def bytes_to_unicode():
                 help="Save a text-only model. It can't be used to encode images")
 ap.add_argument("--vision-only", action="store_true", required=False,
                 help="Save a vision-only model. It can't be used to encode texts")
+ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
+                help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
 ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
 ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
+# Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
+default_image_mean = [0.48145466, 0.4578275, 0.40821073]
+default_image_std = [0.26862954, 0.26130258, 0.27577711]
+ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor)', default=None)
+ap.add_argument('--image_std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)

+# with proper
 args = ap.parse_args()


@@ -96,15 +104,22 @@ def bytes_to_unicode():
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir

-
-with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
-    vocab = json.load(f)
-    tokens = [key for key in vocab]
+if args.clip_model_is_vision:
+    vocab = None
+    tokens = None
+else:
+    with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
+        vocab = json.load(f)
+        tokens = [key for key in vocab]

 with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
     config = json.load(f)
-    v_hparams = config["vision_config"]
-    t_hparams = config["text_config"]
+    if args.clip_model_is_vision:
+        v_hparams = config
+        t_hparams = None
+    else:
+        v_hparams = config["vision_config"]
+        t_hparams = config["text_config"]

 # possible data types
 # ftype == 0 -> float32
@@ -117,9 +132,12 @@ def bytes_to_unicode():
 if args.use_f32:
     ftype = 0

-
-model = CLIPModel.from_pretrained(dir_model)
-processor = CLIPProcessor.from_pretrained(dir_model)
+if args.clip_model_is_vision:
+    model = CLIPVisionModel.from_pretrained(dir_model)
+    processor = None
+else:
+    model = CLIPModel.from_pretrained(dir_model)
+    processor = CLIPProcessor.from_pretrained(dir_model)

 fname_middle = None
 has_text_encoder = True
@@ -128,13 +146,13 @@ def bytes_to_unicode():
 if args.text_only:
     fname_middle = "text-"
     has_vision_encoder = False
-elif args.vision_only:
-    fname_middle = "vision-"
-    has_text_encoder = False
 elif args.llava_projector is not None:
     fname_middle = "mmproj-"
     has_text_encoder = False
     has_llava_projector = True
+elif args.vision_only:
+    fname_middle = "vision-"
+    has_text_encoder = False
 else:
     fname_middle = ""

@@ -182,8 +200,12 @@ def bytes_to_unicode():
     block_count = v_hparams["num_hidden_layers"] - 1 if has_llava_projector else v_hparams["num_hidden_layers"]
     fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), block_count)

-    image_mean = processor.image_processor.image_mean if args.image_mean is None else args.image_mean
-    image_std = processor.image_processor.image_std if args.image_std is None else args.image_std
+    if processor is not None:
+        image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
+        image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std
+    else:
+        image_mean = args.image_mean if args.image_mean is not None else default_image_mean
+        image_std = args.image_std if args.image_std is not None else default_image_std
     fout.add_array("clip.vision.image_mean", image_mean)
     fout.add_array("clip.vision.image_std", image_std)

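With these changes, a pure-vision CLIP checkpoint (for example a ShareGPT4V vision extract) can be converted by passing the new --clip_model_is_vision flag. The invocation below is only a sketch: the script name and all paths are illustrative, and it assumes the existing -m/--model-dir argument defined earlier in the script, outside this diff:

    python convert-image-encoder-to-gguf.py -m ./ShareGPT4V-vision --llava-projector ./ShareGPT4V-vision/llava.projector --clip_model_is_vision -o ./out

Because processor is None in this mode, image_mean and image_std fall back to default_image_mean/default_image_std unless --image_mean and --image_std are given explicitly.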