Updated the comment in llm_core (#465)

* add api_test

* modified api_test.py

* update comments in llm_core

* update comments in llm_core

* add directions for the API test
This commit is contained in:
Jingyuan Huang
2025-03-08 23:19:41 -05:00
committed by GitHub
parent 57046b5a05
commit 00c76f0d2e
9 changed files with 419 additions and 66 deletions

View File

@@ -1,16 +1,27 @@
from transformers import AutoTokenizer, AutoModelForCausalLM
from litellm import completion
from transformers import pipeline
import re
import os
from aios.config.config_manager import config
class HfLocalBackend:
def __init__(self, model_name, max_gpu_memory=None, eval_device=None, hostname=None):
"""
A backend class for loading and interacting with Hugging Face local models.
Supports both local execution and hosted inference if a hostname is provided.
"""
def __init__(self, model_name, device="auto", max_gpu_memory=None, hostname=None):
"""
Initializes the Hugging Face local backend.
Args:
model_name (str): The name of the model to load.
device (str, optional): The device to load the model on (default is "auto").
max_gpu_memory (str, optional): Maximum GPU memory allocation.
hostname (str, optional): The hostname for a hosted HF instance. If provided,
the model will not be loaded locally.
"""
print("\n=== HfLocalBackend Initialization ===")
print(f"Model name: {model_name}")
print(f"Checking HF API key:")
@@ -18,8 +29,8 @@ class HfLocalBackend:
print(f"HF_AUTH_TOKEN in env: {'Yes' if 'HF_AUTH_TOKEN' in os.environ else 'No'}")
self.model_name = model_name
self.device = device
self.max_gpu_memory = max_gpu_memory
self.eval_device = eval_device
self.hostname = hostname
# If a hostname is given, then this HF instance is hosted as a web server.
@@ -29,18 +40,29 @@ class HfLocalBackend:
self.model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map="auto",
device_map=device,
max_memory=self.max_gpu_memory,
use_auth_token=os.environ["HF_TOKEN"],
use_auth_token=os.environ["HUGGING_FACE_API_KEY"],
)
self.tokenizer = AutoTokenizer.from_pretrained(
model_name,
device_map="auto",
use_auth_token=os.environ["HF_TOKEN"]
device_map=device,
use_auth_token=os.environ["HUGGING_FACE_API_KEY"]
)
self.tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}"
def inference_online(self, messages, temperature, stream=False):
"""
Sends inference requests to a remote Hugging Face model hosted at the specified hostname.
Args:
messages (list): The chat messages for inference.
temperature (float): Sampling temperature for response generation.
stream (bool, optional): Whether to stream responses (default is False).
Returns:
str: The generated response content.
"""
return completion(
model="huggingface/" + self.model_name,
messages=messages,
@@ -54,69 +76,224 @@ class HfLocalBackend:
temperature,
stream=False,
):
"""
Generates a response from the locally loaded Hugging Face model or a remote hosted model.
Args:
messages (list): The chat messages for inference.
temperature (float): Sampling temperature for response generation.
stream (bool, optional): Whether to stream responses (default is False).
Returns:
str: The generated response.
"""
if self.hostname is not None:
return self.inference_online(messages, temperature, stream=stream)
if stream:
raise NotImplemented
inputs = self.tokenizer.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt"
)
inputs = self.tokenizer.apply_chat_template(messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt")
inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
temperature = temperature if temperature > 0.5 else 0.5
response = self.model.generate(
**inputs,
temperature=temperature,
max_length=4096,
top_k=10,
num_beams=4,
early_stopping=True,
do_sample=True,
num_return_sequences=1,
eos_token_id=self.tokenizer.eos_token_id
)
length = inputs["input_ids"].shape[1]
result = self.tokenizer.decode(response[0][length:]).replace("assistant\n\n", "")
response = self.model.generate(**inputs,
temperature=temperature,
max_length=4096,
top_k=10,
num_beams=4,
early_stopping=True,
do_sample=True,
num_return_sequences=1,
eos_token_id=self.tokenizer.eos_token_id)
length = inputs["input_ids"].shape[1]
result = self.tokenizer.decode(response[0][length:])
return result
def generate(self, messages, temperature=1.0, tools=None, max_length=1024):
# breakpoint()
class VLLMLocalBackend:
"""
The VLLMLocalBackend class provides an interface for loading and interacting with vLLM models,
supporting both local execution and hosted inference. It allows seamless switching between
local model execution and remote API-based inference.
Attributes:
model_name (str): The name of the model to be loaded.
device (str): The device to load the model on (default: "auto").
max_gpu_memory (Optional[str]): Specifies the maximum GPU memory allocation.
hostname (Optional[str]): URL for a hosted vLLM instance. If provided, the model
will not be loaded locally.
Example:
```python
backend = VLLMLocalBackend(model_name="mistral-7b", device="cuda", max_gpu_memory="16GB")
messages = [{"role": "user", "content": "Tell me a joke."}]
response = backend(messages, temperature=0.7)
print(response)
```
"""
def __init__(self, model_name, device="auto", max_gpu_memory=None, hostname=None):
"""
Initializes the vLLM local backend.
Args:
model_name (str): The name of the model to load.
device (str, optional): The device to load the model on (default is "auto").
max_gpu_memory (str, optional): Maximum GPU memory allocation.
hostname (str, optional): The hostname for a hosted vLLM instance. If provided,
the model will not be loaded locally.
"""
print("\n=== VLLMLocalBackend Initialization ===")
print(f"Model name: {model_name}")
self.model_name = model_name
self.device = device
self.max_gpu_memory = max_gpu_memory
# self.hostname = hostname
self.hostname = "http://localhost:8001"
# If a hostname is given, then this vLLM instance is hosted as a web server.
# Therefore, do not start the AIOS-based vLLM instance.
if self.hostname is not None:
return
try:
import vllm
self.model = vllm.LLM(
model_name,
tensor_parallel_size=1 if max_gpu_memory is None else len(max_gpu_memory)
)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.sampling_params = vllm.SamplingParams(temperature=temperature)
except ImportError:
raise ImportError("Could not import vllm Python package"
"Please install it with `pip install python`")
except Exception as err:
print("Error loading vllm model:", err)
def inference_online(self, messages, temperature, stream=False):
    """
    Sends inference requests to a hosted vLLM instance.

    Args:
        messages (List[Dict[str, str]]): A list of messages in chat format.
        temperature (float): Controls randomness in response generation.
        stream (bool, optional): Whether to use streaming mode (not implemented;
            the request is always issued non-streaming).

    Returns:
        str: The generated response from the hosted vLLM instance.
    """
    # Fix: removed a stray debugger breakpoint() that was left at the top of
    # this method and would halt every hosted-inference request.
    return completion(
        model="hosted_vllm/" + self.model_name,
        messages=messages,
        temperature=temperature,
        api_base=self.hostname,
    ).choices[0].message.content
def __call__(
self,
messages,
temperature,
stream=False,
):
"""
Generates a response using the vLLM model.
Args:
messages (List[Dict[str, str]]): A list of chat messages.
temperature (float): Controls randomness in response generation.
stream (bool, optional): Whether to use streaming mode (not implemented).
Returns:
str: The generated response text.
"""
if self.hostname is not None:
return self.inference_online(messages, temperature, stream=stream)
assert self.model
assert self.sampling_params
# breakpoint()
if stream:
raise NotImplemented
# parameters = vllm.SamplingParams(temperature=temperature)
prompt = self.tokenizer.apply_chat_template(messages,
tokenize=False)
response = self.model.generate(prompt, self.sampling_params)
result = response[0].outputs[0].text
inputs = self.tokenizer.apply_chat_template(
messages,
tokenize=True,
# add_generation_prompt=True,
tools=tools,
return_dict=True,
return_tensors="pt"
)
inputs = {k: v.to(self.eval_device) for k, v in inputs.items()}
temperature = temperature if temperature > 0.5 else 0.5
response = self.model.generate(
**inputs,
temperature=temperature,
max_length=max_length,
top_k=10,
num_beams=4,
early_stopping=True,
do_sample=True,
num_return_sequences=1,
eos_token_id=self.tokenizer.eos_token_id
)
length = inputs["input_ids"].shape[1]
result = self.tokenizer.decode(response[0][length:], skip_special_tokens=True)
breakpoint()
result = re.sub(r'^\s*assistant[:]?\s*|\<\/?assistant\>|assistant\n\n', '', result, flags=re.IGNORECASE)
result = result.lstrip()
return result
class OllamaBackend:
    """
    Backend for chat inference against an Ollama server through the litellm API.

    Supports both a locally running Ollama daemon and a remote instance reached
    over HTTP.

    Attributes:
        model_name (str): Name of the Ollama model to query.
        hostname (str): Base URL of the Ollama server. Defaults to
            "http://localhost:11434" when none is supplied.

    Example:
        ```python
        backend = OllamaBackend(model_name="mistral-7b")
        messages = [{"role": "user", "content": "Explain quantum entanglement."}]
        response = backend(messages, temperature=0.7)
        print(response)
        ```
    """

    def __init__(self, model_name, device="auto", max_gpu_memory=None, hostname=None):
        """
        Set up the Ollama backend and record where requests should be sent.

        Args:
            model_name (str): The name of the model to use.
            device (str, optional): Accepted for interface parity with the other
                backends; Ollama manages its own device placement.
            max_gpu_memory (str, optional): Accepted for interface parity; unused.
            hostname (str, optional): URL of a running Ollama instance. Falls
                back to "http://localhost:11434" when omitted.
        """
        print("\n=== OllamaBackend Initialization ===")
        print(f"Model name: {model_name}")
        print(f"Hostname: {hostname or 'http://localhost:11434'}")
        self.model_name = model_name
        self.hostname = hostname or "http://localhost:11434"

    def __call__(self, messages, temperature, stream=False):
        """
        Send an inference request to the configured Ollama model.

        Args:
            messages (List[Dict[str, str]]): Chat messages in dialogue format.
            temperature (float): Controls randomness in response generation.
            stream (bool, optional): Streaming flag; not implemented.

        Returns:
            str: The generated response from the Ollama model.
        """
        response = completion(
            model="ollama/" + self.model_name,
            messages=messages,
            temperature=temperature,
            api_base=self.hostname,
        )
        return response.choices[0].message.content

View File

@@ -18,14 +18,51 @@ class RouterStrategy(Enum):
SIMPLE = 0,
class SimpleStrategy:
"""
The SimpleStrategy class implements a round-robin selection strategy for load-balancing LLM requests.
It iterates through a list of selected language models and returns their corresponding index based on
the request count.
This strategy ensures that multiple models are utilized in sequence, distributing queries evenly across the available configurations.
Args:
llm_configs (List[Dict[str, Any]]): A list of LLM configurations, where each dictionary contains model information such as name, backend, and other optional parameters.
Example:
```python
configs = [
{"name": "gpt-4o-mini", "backend": "openai"},
{"name": "qwen2.5-7b", "backend": "ollama"}
]
selected_llms = [
{"name": "gpt-4o-mini"},
{"name": "qwen2.5-7b"}
]
strategy = SimpleStrategy(llm_configs=configs)
model_idxs = strategy.get_model_idxs(selected_llms, n_queries=3)
```
"""
def __init__(self, llm_configs: List[Dict[str, Any]]):
self.llm_configs = llm_configs
self.idx = 0
self.idx = 0 # internal index to track the current model in the round-robin selection.
# def __call__(self):
# return self.get_model()
def get_model_idxs(self, selected_llms: List[str], n_queries: int=1):
"""
Selects model indices from the available LLM configurations using a round-robin strategy.
Args:
selected_llms (List[str]): A list of selected LLM names from which models will be chosen.
n_queries (int): The number of queries to distribute among the selected models. Defaults to 1.
Returns:
List[int]: A list of indices corresponding to the selected models in `self.llm_configs`.
"""
# current = self.selected_llms[self.idx]
model_idxs = []

View File

@@ -3,11 +3,32 @@ import re
import uuid
def tool_calling_input_format(messages: list, tools: list) -> list:
"""Integrate tool information into the messages for open-sourced LLMs
"""
Integrate tool information into the messages for open-sourced LLMs.
Args:
messages (list): messages with different roles
tools (list): tool information
messages (list): A list of message dictionaries, each containing at least a "role"
and "content" field. Some messages may contain "tool_calls".
tools (list): A list of available tool definitions, formatted as dictionaries.
Returns:
list: The updated messages list, where:
- Tool call messages are formatted properly for models without built-in tool support.
- Messages indicating tool execution results are transformed into a user message.
- The last message includes an instruction prompt detailing tool usage requirements.
Example:
```python
messages = [
{"role": "user", "content": "Translate 'hello' to French."},
{"role": "assistant", "tool_calls": [{"name": "translate", "parameters": {"text": "hello", "language": "fr"}}]}
]
tools = [{"name": "translate", "description": "Translates text into another language."}]
updated_messages = tool_calling_input_format(messages, tools)
print(updated_messages)
```
"""
prefix_prompt = (
"In and only in current step, you need to call tools. Available tools are: "
@@ -39,6 +60,22 @@ def tool_calling_input_format(messages: list, tools: list) -> list:
return messages
def parse_json_format(message: str) -> str:
"""
Extract and parse a JSON object or array from a given string.
Args:
message (str): The input string potentially containing a JSON object or array.
Returns:
str: A string representation of the extracted JSON object or array.
Example:
```python
message = "Here is some data: {\"key\": \"value\"}"
parsed_json = parse_json_format(message)
print(parsed_json) # Output: '{"key": "value"}'
```
"""
json_array_pattern = r"\[\s*\{.*?\}\s*\]"
json_object_pattern = r"\{\s*.*?\s*\}"
@@ -66,9 +103,43 @@ def parse_json_format(message: str) -> str:
return "[]"
def generator_tool_call_id():
    """
    Generate a unique identifier for a tool call.

    A fresh UUID (Universally Unique Identifier) is created per invocation and
    rendered in its canonical string form.

    Returns:
        str: A unique tool call ID.

    Example:
        ```python
        tool_call_id = generator_tool_call_id()
        print(tool_call_id)  # Example output: 'f3f2e850-b5d4-11ef-ac7e-96584d5248b2'
        ```
    """
    call_id = uuid.uuid4()
    return str(call_id)
def decode_litellm_tool_calls(response):
"""
Decode tool call responses from LiteLLM API format.
Args:
response: The response object from LiteLLM API.
Returns:
list: A list of dictionaries, each containing:
- "name": The name of the function being called.
- "parameters": The arguments passed to the function.
- "id": The unique identifier of the tool call.
Example:
```python
response = <LiteLLM API response>
decoded_calls = decode_litellm_tool_calls(response)
print(decoded_calls)
# Output: [{'name': 'translate', 'parameters': {'text': 'hello', 'lang': 'fr'}, 'id': 'uuid1234'}]
```
"""
tool_calls = response.choices[0].message.tool_calls
decoded_tool_calls = []
@@ -84,6 +155,23 @@ def decode_litellm_tool_calls(response):
return decoded_tool_calls
def parse_tool_calls(message):
"""
Parse and process tool calls from a message string.
Args:
message (str): A JSON string representing tool calls.
Returns:
list: A list of processed tool calls with unique IDs.
Example:
```python
message = '[{"name": "text_translate", "parameters": {"text": "hello", "lang": "fr"}}]'
parsed_calls = parse_tool_calls(message)
print(parsed_calls)
# Output: [{'name': 'text/translate', 'parameters': {'text': 'hello', 'lang': 'fr'}, 'id': 'uuid1234'}]
```
"""
# add tool call id and type for models don't support tool call
# if isinstance(message, dict):
# message = [message]
@@ -103,6 +191,23 @@ def parse_tool_calls(message):
return tool_calls
def slash_to_double_underscore(tools):
"""
Convert function names by replacing slashes ("/") with double underscores ("__").
Args:
tools (list): A list of tool dictionaries.
Returns:
list: The updated tools list with function names formatted properly.
Example:
```python
tools = [{"function": {"name": "text/translate"}}]
formatted_tools = slash_to_double_underscore(tools)
print(formatted_tools)
# Output: [{'function': {'name': 'text__translate'}}]
```
"""
for tool in tools:
tool_name = tool["function"]["name"]
if "/" in tool_name:
@@ -111,12 +216,46 @@ def slash_to_double_underscore(tools):
return tools
def double_underscore_to_slash(tool_calls):
    """
    Convert function names by replacing double underscores ("__") back to slashes ("/").

    Mutates `tool_calls` in place and returns the same list. Each call's
    "parameters" field is decoded from its JSON string form; parameters that
    are already dicts are left untouched (robustness fix: the previous version
    raised TypeError from json.loads when parameters had already been parsed).

    Args:
        tool_calls (list): A list of tool call dictionaries, each containing a
            "name" and a "parameters" entry.

    Returns:
        list: The updated tool calls list with function names restored to their
            original format and parameters decoded.

    Example:
        ```python
        tool_calls = [{"name": "text__translate", "parameters": '{"text": "hello", "lang": "fr"}'}]
        restored_calls = double_underscore_to_slash(tool_calls)
        print(restored_calls)
        # Output: [{'name': 'text/translate', 'parameters': {'text': 'hello', 'lang': 'fr'}}]
        ```
    """
    for tool_call in tool_calls:
        tool_call["name"] = tool_call["name"].replace("__", "/")
        params = tool_call["parameters"]
        # Only decode when still a JSON string; leave already-parsed dicts as-is.
        tool_call["parameters"] = json.loads(params) if isinstance(params, str) else params
    return tool_calls
def pre_process_tools(tools):
"""
Pre-process tool definitions by replacing slashes ("/") with double underscores ("__").
Args:
tools (list): A list of tool dictionaries.
Returns:
list: The processed tools list with modified function names.
Example:
```python
tools = [{"function": {"name": "text/translate"}}]
preprocessed_tools = pre_process_tools(tools)
print(preprocessed_tools)
# Output: [{'function': {'name': 'text__translate'}}]
```
"""
for tool in tools:
tool_name = tool["function"]["name"]
if "/" in tool_name:

View File

View File