Updated the comment in llm_core (#465)

* add api_test

* modified api_test.py

* update comments in llm_core

* update comments in llm_core

* add directions for the API test
This commit is contained in:
Jingyuan Huang
2025-03-08 23:19:41 -05:00
committed by GitHub
parent 57046b5a05
commit 00c76f0d2e
9 changed files with 419 additions and 66 deletions

View File

@@ -1,16 +1,27 @@
from transformers import AutoTokenizer, AutoModelForCausalLM
from litellm import completion
from transformers import pipeline
import re
import os
from aios.config.config_manager import config
class HfLocalBackend:
def __init__(self, model_name, max_gpu_memory=None, eval_device=None, hostname=None):
"""
A backend class for loading and interacting with Hugging Face local models.
Supports both local execution and hosted inference if a hostname is provided.
"""
def __init__(self, model_name, device="auto", max_gpu_memory=None, hostname=None):
"""
Initializes the Hugging Face local backend.
Args:
model_name (str): The name of the model to load.
device (str, optional): The device to load the model on (default is "auto").
max_gpu_memory (str, optional): Maximum GPU memory allocation.
hostname (str, optional): The hostname for a hosted HF instance. If provided,
the model will not be loaded locally.
"""
print("\n=== HfLocalBackend Initialization ===")
print(f"Model name: {model_name}")
print(f"Checking HF API key:")
@@ -18,8 +29,8 @@ class HfLocalBackend:
print(f"HF_AUTH_TOKEN in env: {'Yes' if 'HF_AUTH_TOKEN' in os.environ else 'No'}")
self.model_name = model_name
self.device = device
self.max_gpu_memory = max_gpu_memory
self.eval_device = eval_device
self.hostname = hostname
# If a hostname is given, then this HF instance is hosted as a web server.
@@ -29,18 +40,29 @@ class HfLocalBackend:
self.model = AutoModelForCausalLM.from_pretrained(
model_name,
device_map="auto",
device_map=device,
max_memory=self.max_gpu_memory,
use_auth_token=os.environ["HF_TOKEN"],
use_auth_token=os.environ["HUGGING_FACE_API_KEY"],
)
self.tokenizer = AutoTokenizer.from_pretrained(
model_name,
device_map="auto",
use_auth_token=os.environ["HF_TOKEN"]
device_map=device,
use_auth_token=os.environ["HUGGING_FACE_API_KEY"]
)
self.tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}"
def inference_online(self, messages, temperature, stream=False):
"""
Sends inference requests to a remote Hugging Face model hosted at the specified hostname.
Args:
messages (list): The chat messages for inference.
temperature (float): Sampling temperature for response generation.
stream (bool, optional): Whether to stream responses (default is False).
Returns:
str: The generated response content.
"""
return completion(
model="huggingface/" + self.model_name,
messages=messages,
@@ -54,69 +76,224 @@ class HfLocalBackend:
temperature,
stream=False,
):
"""
Generates a response from the locally loaded Hugging Face model or a remote hosted model.
Args:
messages (list): The chat messages for inference.
temperature (float): Sampling temperature for response generation.
stream (bool, optional): Whether to stream responses (default is False).
Returns:
str: The generated response.
"""
if self.hostname is not None:
return self.inference_online(messages, temperature, stream=stream)
if stream:
raise NotImplemented
inputs = self.tokenizer.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt"
)
inputs = self.tokenizer.apply_chat_template(messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt")
inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
temperature = temperature if temperature > 0.5 else 0.5
response = self.model.generate(
**inputs,
temperature=temperature,
max_length=4096,
top_k=10,
num_beams=4,
early_stopping=True,
do_sample=True,
num_return_sequences=1,
eos_token_id=self.tokenizer.eos_token_id
)
length = inputs["input_ids"].shape[1]
result = self.tokenizer.decode(response[0][length:]).replace("assistant\n\n", "")
response = self.model.generate(**inputs,
temperature=temperature,
max_length=4096,
top_k=10,
num_beams=4,
early_stopping=True,
do_sample=True,
num_return_sequences=1,
eos_token_id=self.tokenizer.eos_token_id)
length = inputs["input_ids"].shape[1]
result = self.tokenizer.decode(response[0][length:])
return result
def generate(self, messages, temperature=1.0, tools=None, max_length=1024):
# breakpoint()
class VLLMLocalBackend:
"""
The VLLMLocalBackend class provides an interface for loading and interacting with vLLM models,
supporting both local execution and hosted inference. It allows seamless switching between
local model execution and remote API-based inference.
Attributes:
model_name (str): The name of the model to be loaded.
device (str): The device to load the model on (default: "auto").
max_gpu_memory (Optional[str]): Specifies the maximum GPU memory allocation.
hostname (Optional[str]): URL for a hosted vLLM instance. If provided, the model
will not be loaded locally.
Example:
```python
backend = VLLMLocalBackend(model_name="mistral-7b", device="cuda", max_gpu_memory="16GB")
messages = [{"role": "user", "content": "Tell me a joke."}]
response = backend(messages, temperature=0.7)
print(response)
```
"""
def __init__(self, model_name, device="auto", max_gpu_memory=None, hostname=None):
"""
Initializes the vLLM local backend.
Args:
model_name (str): The name of the model to load.
device (str, optional): The device to load the model on (default is "auto").
max_gpu_memory (str, optional): Maximum GPU memory allocation.
hostname (str, optional): The hostname for a hosted vLLM instance. If provided,
the model will not be loaded locally.
"""
print("\n=== VLLMLocalBackend Initialization ===")
print(f"Model name: {model_name}")
self.model_name = model_name
self.device = device
self.max_gpu_memory = max_gpu_memory
# self.hostname = hostname
self.hostname = "http://localhost:8001"
# If a hostname is given, then this vLLM instance is hosted as a web server.
# Therefore, do not start the AIOS-based vLLM instance.
if self.hostname is not None:
return
try:
import vllm
self.model = vllm.LLM(
model_name,
tensor_parallel_size=1 if max_gpu_memory is None else len(max_gpu_memory)
)
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.sampling_params = vllm.SamplingParams(temperature=temperature)
except ImportError:
raise ImportError("Could not import vllm Python package"
"Please install it with `pip install python`")
except Exception as err:
print("Error loading vllm model:", err)
def inference_online(self, messages, temperature, stream=False):
    """
    Sends inference requests to a hosted vLLM instance.

    Args:
        messages (List[Dict[str, str]]): A list of messages in chat format.
        temperature (float): Controls randomness in response generation.
        stream (bool, optional): Whether to use streaming mode (not implemented;
            the request is always issued non-streaming).

    Returns:
        str: The generated response from the hosted vLLM instance.
    """
    # Fix: removed a stray debugger breakpoint() that was left at the top of
    # this method and would halt every hosted-inference request.
    return completion(
        model="hosted_vllm/" + self.model_name,
        messages=messages,
        temperature=temperature,
        api_base=self.hostname,
    ).choices[0].message.content
def __call__(
self,
messages,
temperature,
stream=False,
):
"""
Generates a response using the vLLM model.
Args:
messages (List[Dict[str, str]]): A list of chat messages.
temperature (float): Controls randomness in response generation.
stream (bool, optional): Whether to use streaming mode (not implemented).
Returns:
str: The generated response text.
"""
if self.hostname is not None:
return self.inference_online(messages, temperature, stream=stream)
assert self.model
assert self.sampling_params
# breakpoint()
if stream:
raise NotImplemented
# parameters = vllm.SamplingParams(temperature=temperature)
prompt = self.tokenizer.apply_chat_template(messages,
tokenize=False)
response = self.model.generate(prompt, self.sampling_params)
result = response[0].outputs[0].text
inputs = self.tokenizer.apply_chat_template(
messages,
tokenize=True,
# add_generation_prompt=True,
tools=tools,
return_dict=True,
return_tensors="pt"
)
inputs = {k: v.to(self.eval_device) for k, v in inputs.items()}
temperature = temperature if temperature > 0.5 else 0.5
response = self.model.generate(
**inputs,
temperature=temperature,
max_length=max_length,
top_k=10,
num_beams=4,
early_stopping=True,
do_sample=True,
num_return_sequences=1,
eos_token_id=self.tokenizer.eos_token_id
)
length = inputs["input_ids"].shape[1]
result = self.tokenizer.decode(response[0][length:], skip_special_tokens=True)
breakpoint()
result = re.sub(r'^\s*assistant[:]?\s*|\<\/?assistant\>|assistant\n\n', '', result, flags=re.IGNORECASE)
result = result.lstrip()
return result
class OllamaBackend:
    """
    Backend for chat inference against an Ollama server through the litellm API.

    Supports both a locally running Ollama daemon and a remote instance reached
    over HTTP.

    Attributes:
        model_name (str): Name of the Ollama model to query.
        hostname (str): Base URL of the Ollama server. Defaults to
            "http://localhost:11434" when none is supplied.

    Example:
        ```python
        backend = OllamaBackend(model_name="mistral-7b")
        messages = [{"role": "user", "content": "Explain quantum entanglement."}]
        response = backend(messages, temperature=0.7)
        print(response)
        ```
    """

    def __init__(self, model_name, device="auto", max_gpu_memory=None, hostname=None):
        """
        Set up the Ollama backend and record where requests should be sent.

        Args:
            model_name (str): The name of the model to use.
            device (str, optional): Accepted for interface parity with the other
                backends; Ollama manages its own device placement.
            max_gpu_memory (str, optional): Accepted for interface parity; unused.
            hostname (str, optional): URL of a running Ollama instance. Falls
                back to "http://localhost:11434" when omitted.
        """
        print("\n=== OllamaBackend Initialization ===")
        print(f"Model name: {model_name}")
        print(f"Hostname: {hostname or 'http://localhost:11434'}")
        self.model_name = model_name
        self.hostname = hostname or "http://localhost:11434"

    def __call__(self, messages, temperature, stream=False):
        """
        Send an inference request to the configured Ollama model.

        Args:
            messages (List[Dict[str, str]]): Chat messages in dialogue format.
            temperature (float): Controls randomness in response generation.
            stream (bool, optional): Streaming flag; not implemented.

        Returns:
            str: The generated response from the Ollama model.
        """
        response = completion(
            model="ollama/" + self.model_name,
            messages=messages,
            temperature=temperature,
            api_base=self.hostname,
        )
        return response.choices[0].message.content

View File

@@ -18,14 +18,51 @@ class RouterStrategy(Enum):
SIMPLE = 0,
class SimpleStrategy:
"""
The SimpleStrategy class implements a round-robin selection strategy for load-balancing LLM requests.
It iterates through a list of selected language models and returns their corresponding index based on
the request count.
This strategy ensures that multiple models are utilized in sequence, distributing queries evenly across the available configurations.
Args:
llm_configs (List[Dict[str, Any]]): A list of LLM configurations, where each dictionary contains model information such as name, backend, and other optional parameters.
Example:
```python
configs = [
{"name": "gpt-4o-mini", "backend": "openai"},
{"name": "qwen2.5-7b", "backend": "ollama"}
]
selected_llms = [
{"name": "gpt-4o-mini"},
{"name": "qwen2.5-7b"}
]
strategy = SimpleStrategy(llm_configs=configs)
model_idxs = strategy.get_model_idxs(selected_llms, n_queries=3)
```
"""
def __init__(self, llm_configs: List[Dict[str, Any]]):
self.llm_configs = llm_configs
self.idx = 0
self.idx = 0 # internal index to track the current model in the round-robin selection.
# def __call__(self):
# return self.get_model()
def get_model_idxs(self, selected_llms: List[str], n_queries: int=1):
"""
Selects model indices from the available LLM configurations using a round-robin strategy.
Args:
selected_llms (List[str]): A list of selected LLM names from which models will be chosen.
n_queries (int): The number of queries to distribute among the selected models. Defaults to 1.
Returns:
List[int]: A list of indices corresponding to the selected models in `self.llm_configs`.
"""
# current = self.selected_llms[self.idx]
model_idxs = []

View File

@@ -3,11 +3,32 @@ import re
import uuid
def tool_calling_input_format(messages: list, tools: list) -> list:
"""Integrate tool information into the messages for open-sourced LLMs
"""
Integrate tool information into the messages for open-sourced LLMs.
Args:
messages (list): messages with different roles
tools (list): tool information
messages (list): A list of message dictionaries, each containing at least a "role"
and "content" field. Some messages may contain "tool_calls".
tools (list): A list of available tool definitions, formatted as dictionaries.
Returns:
list: The updated messages list, where:
- Tool call messages are formatted properly for models without built-in tool support.
- Messages indicating tool execution results are transformed into a user message.
- The last message includes an instruction prompt detailing tool usage requirements.
Example:
```python
messages = [
{"role": "user", "content": "Translate 'hello' to French."},
{"role": "assistant", "tool_calls": [{"name": "translate", "parameters": {"text": "hello", "language": "fr"}}]}
]
tools = [{"name": "translate", "description": "Translates text into another language."}]
updated_messages = tool_calling_input_format(messages, tools)
print(updated_messages)
```
"""
prefix_prompt = (
"In and only in current step, you need to call tools. Available tools are: "
@@ -39,6 +60,22 @@ def tool_calling_input_format(messages: list, tools: list) -> list:
return messages
def parse_json_format(message: str) -> str:
"""
Extract and parse a JSON object or array from a given string.
Args:
message (str): The input string potentially containing a JSON object or array.
Returns:
str: A string representation of the extracted JSON object or array.
Example:
```python
message = "Here is some data: {\"key\": \"value\"}"
parsed_json = parse_json_format(message)
print(parsed_json) # Output: '{"key": "value"}'
```
"""
json_array_pattern = r"\[\s*\{.*?\}\s*\]"
json_object_pattern = r"\{\s*.*?\s*\}"
@@ -66,9 +103,43 @@ def parse_json_format(message: str) -> str:
return "[]"
def generator_tool_call_id():
    """
    Generate a unique identifier for a tool call.

    A fresh UUID (Universally Unique Identifier) is created per invocation and
    rendered in its canonical string form.

    Returns:
        str: A unique tool call ID.

    Example:
        ```python
        tool_call_id = generator_tool_call_id()
        print(tool_call_id)  # Example output: 'f3f2e850-b5d4-11ef-ac7e-96584d5248b2'
        ```
    """
    call_id = uuid.uuid4()
    return str(call_id)
def decode_litellm_tool_calls(response):
"""
Decode tool call responses from LiteLLM API format.
Args:
response: The response object from LiteLLM API.
Returns:
list: A list of dictionaries, each containing:
- "name": The name of the function being called.
- "parameters": The arguments passed to the function.
- "id": The unique identifier of the tool call.
Example:
```python
response = <LiteLLM API response>
decoded_calls = decode_litellm_tool_calls(response)
print(decoded_calls)
# Output: [{'name': 'translate', 'parameters': {'text': 'hello', 'lang': 'fr'}, 'id': 'uuid1234'}]
```
"""
tool_calls = response.choices[0].message.tool_calls
decoded_tool_calls = []
@@ -84,6 +155,23 @@ def decode_litellm_tool_calls(response):
return decoded_tool_calls
def parse_tool_calls(message):
"""
Parse and process tool calls from a message string.
Args:
message (str): A JSON string representing tool calls.
Returns:
list: A list of processed tool calls with unique IDs.
Example:
```python
message = '[{"name": "text_translate", "parameters": {"text": "hello", "lang": "fr"}}]'
parsed_calls = parse_tool_calls(message)
print(parsed_calls)
# Output: [{'name': 'text/translate', 'parameters': {'text': 'hello', 'lang': 'fr'}, 'id': 'uuid1234'}]
```
"""
# add tool call id and type for models don't support tool call
# if isinstance(message, dict):
# message = [message]
@@ -103,6 +191,23 @@ def parse_tool_calls(message):
return tool_calls
def slash_to_double_underscore(tools):
"""
Convert function names by replacing slashes ("/") with double underscores ("__").
Args:
tools (list): A list of tool dictionaries.
Returns:
list: The updated tools list with function names formatted properly.
Example:
```python
tools = [{"function": {"name": "text/translate"}}]
formatted_tools = slash_to_double_underscore(tools)
print(formatted_tools)
# Output: [{'function': {'name': 'text__translate'}}]
```
"""
for tool in tools:
tool_name = tool["function"]["name"]
if "/" in tool_name:
@@ -111,12 +216,46 @@ def slash_to_double_underscore(tools):
return tools
def double_underscore_to_slash(tool_calls):
    """
    Convert function names by replacing double underscores ("__") back to slashes ("/").

    Mutates `tool_calls` in place and returns the same list. Each call's
    "parameters" field is decoded from its JSON string form; parameters that
    are already dicts are left untouched (robustness fix: the previous version
    raised TypeError from json.loads when parameters had already been parsed).

    Args:
        tool_calls (list): A list of tool call dictionaries, each containing a
            "name" and a "parameters" entry.

    Returns:
        list: The updated tool calls list with function names restored to their
            original format and parameters decoded.

    Example:
        ```python
        tool_calls = [{"name": "text__translate", "parameters": '{"text": "hello", "lang": "fr"}'}]
        restored_calls = double_underscore_to_slash(tool_calls)
        print(restored_calls)
        # Output: [{'name': 'text/translate', 'parameters': {'text': 'hello', 'lang': 'fr'}}]
        ```
    """
    for tool_call in tool_calls:
        tool_call["name"] = tool_call["name"].replace("__", "/")
        params = tool_call["parameters"]
        # Only decode when still a JSON string; leave already-parsed dicts as-is.
        tool_call["parameters"] = json.loads(params) if isinstance(params, str) else params
    return tool_calls
def pre_process_tools(tools):
"""
Pre-process tool definitions by replacing slashes ("/") with double underscores ("__").
Args:
tools (list): A list of tool dictionaries.
Returns:
list: The processed tools list with modified function names.
Example:
```python
tools = [{"function": {"name": "text/translate"}}]
preprocessed_tools = pre_process_tools(tools)
print(preprocessed_tools)
# Output: [{'function': {'name': 'text__translate'}}]
```
"""
for tool in tools:
tool_name = tool["function"]["name"]
if "/" in tool_name:

View File

View File