
Commit

download model without loading
BobaZooba committed Dec 6, 2023
1 parent bb2b5a6 commit 4aa17cb
Showing 2 changed files with 23 additions and 14 deletions.
9 changes: 4 additions & 5 deletions src/xllm/cli/prepare.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Tuple, Type
+from typing import Type
 
-from transformers import HfArgumentParser, PreTrainedModel, PreTrainedTokenizer
+from transformers import HfArgumentParser
 
 from ..core.config import Config
 from ..run.prepare import prepare
@@ -23,7 +23,7 @@
 
 def cli_run_prepare(
     config_cls: Type[Config] = Config,
-) -> Tuple[PreTrainedTokenizer, PreTrainedModel]:
+) -> None:
     """
     Provides a command-line interface (CLI) entry point for setting up a tokenizer and model based on a configuration.
@@ -64,8 +64,7 @@ def cli_run_prepare(
     parser = HfArgumentParser(config_cls)
     config = parser.parse_args_into_dataclasses()[0]
     setup_cli(config=config, logger_path="./xllm_prepare.log")
-    tokenizer, model = prepare(config=config)
-    return tokenizer, model
+    prepare(config=config)
 
 
 if __name__ == "__main__":
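Note on the change above: cli_run_prepare (and prepare, in the second file below) no longer returns the tokenizer and model, so callers that relied on the old return value now load them explicitly after the prepare step. A minimal sketch of that pattern, assuming the package is importable as xllm and that Config is a dataclass whose fields seen in the diffs (model_name_or_path, correct_tokenizer_name_or_path) can be passed as keyword arguments; the model name is purely illustrative:

# Sketch only, not part of this commit; the import path and field values are assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer

from xllm.core.config import Config
from xllm.run.prepare import prepare

config = Config(model_name_or_path="facebook/opt-350m")  # hypothetical value
prepare(config=config)  # after this commit: warms the local cache and returns None

# Loading is now an explicit, separate step for the caller:
tokenizer = AutoTokenizer.from_pretrained(config.correct_tokenizer_name_or_path)
model = AutoModelForCausalLM.from_pretrained(config.model_name_or_path)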
28 changes: 19 additions & 9 deletions src/xllm/run/prepare.py
@@ -13,21 +13,18 @@
 # limitations under the License.
 
 import json
-from typing import Tuple
 
 from loguru import logger
 from transformers import (
-    AutoModelForCausalLM,
     AutoTokenizer,
-    PreTrainedModel,
-    PreTrainedTokenizer,
 )
+from transformers.modeling_utils import CONFIG_NAME, cached_file
 
 from ..core.config import Config
 from ..datasets.registry import datasets_registry
 
 
-def prepare(config: Config) -> Tuple[PreTrainedTokenizer, PreTrainedModel]:
+def prepare(config: Config) -> None:
     """
     Prepares the tokenizer and model for use from the provided configuration, and optionally prepares a dataset
     if specified.
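Aside on the newly imported helpers (explanatory, not part of the diff): in transformers, CONFIG_NAME is the constant "config.json", and cached_file resolves a single file from a Hub repository, downloading it into the local cache if necessary and returning the local path. A small sketch of that assumed behavior, using the same import path as the diff:

# Sketch of the public transformers utilities imported above; not repository code.
from transformers.modeling_utils import CONFIG_NAME, cached_file

print(CONFIG_NAME)  # "config.json"

# Resolves config.json for an illustrative repo, downloading it into the local
# Hugging Face cache if needed, and returns the local file path.
path = cached_file("facebook/opt-350m", CONFIG_NAME)
print(path)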
@@ -89,10 +86,23 @@ def prepare(config: Config) -> Tuple[PreTrainedTokenizer, PreTrainedModel]:
     else:
         logger.warning("Dataset is not prepared because this set in config")
 
-    tokenizer = AutoTokenizer.from_pretrained(config.correct_tokenizer_name_or_path)
+    # tokenizer
+    _ = AutoTokenizer.from_pretrained(config.correct_tokenizer_name_or_path)
     logger.info(f"Tokenizer {config.correct_tokenizer_name_or_path} loaded")
 
-    model = AutoModelForCausalLM.from_pretrained(config.model_name_or_path)
+    # model
+    cached_file(
+        config.model_name_or_path,
+        CONFIG_NAME,
+        cache_dir=None,
+        force_download=False,
+        resume_download=False,
+        proxies=None,
+        local_files_only=False,
+        token=None,
+        revision="main",
+        subfolder="",
+        _raise_exceptions_for_missing_entries=False,
+        _raise_exceptions_for_connection_errors=False,
+    )
     logger.info(f"Model {config.model_name_or_path} loaded")
-
-    return tokenizer, model

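Net effect of the second hunk: the tokenizer is still instantiated (which downloads its files), but for the model, prepare now only fetches the repository's config.json into the local cache via cached_file instead of downloading and instantiating the full model with AutoModelForCausalLM.from_pretrained. If the goal were to pre-download all model files, weights included, without loading them into memory, huggingface_hub's snapshot_download is a common way to do that; the sketch below is an alternative under that assumption, not what this commit uses:

# Alternative sketch, not part of this commit: pre-fetch every file of a model
# repository into the local Hugging Face cache without loading anything into memory.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(repo_id="facebook/opt-350m", revision="main")
print(local_dir)  # path to the cached snapshot directory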