# langflow/scripts/build_component_index.py

"""Build a static component index for fast startup.

This script generates a prebuilt index of all built-in components by walking
through the lfx.components package and processing each module. The index is
saved as a JSON file that can be loaded instantly at runtime, avoiding the
need to import all component modules during startup.
"""

import hashlib
import sys
from pathlib import Path

import orjson


def _get_lfx_version():
    """Get the installed lfx version.

    Components are located in LFX, so use LFX.
    """
    from importlib.metadata import version

    version = version("lfx")
    print(f"Retrieved LFX version: {version}")
    return version


def _normalize_for_determinism(obj):
    """Recursively normalize data structures for deterministic serialization.

    Sorts dictionaries by key to ensure consistent ordering. Lists are kept in
    their original order since many lists are semantically ordered (e.g., field_order,
    display_order, etc.).

    Note: If upstream code produces nondeterministic list ordering (e.g., from
    reflection or set iteration), this function will NOT fix it. Ensure lists
    are deterministically ordered before calling this function, or consider
    sorting specific list fields that are semantically unordered (e.g., tags).
    """
    if isinstance(obj, dict):
        # Recursively normalize all dict values and return sorted by keys
        return {k: _normalize_for_determinism(v) for k, v in sorted(obj.items())}
    if isinstance(obj, list):
        # Recursively normalize list items but preserve order
        # Lists like field_order, display_order, etc. are semantically ordered
        return [_normalize_for_determinism(item) for item in obj]
    # Primitive types, return as-is
    return obj


def _strip_dynamic_fields(obj):
    """Recursively remove dynamic fields that change with external dependencies.

    This prevents unnecessary hash changes and git history bloat when dependencies update.
    Timestamps are stripped to ensure deterministic builds - version is used as the timeline.
    """
    # List of field names that are dynamically populated from external sources
    # or contain runtime-specific data
    dynamic_field_names = {"timestamp", "deprecated_at"}

    if isinstance(obj, dict):
        return {k: _strip_dynamic_fields(v) for k, v in obj.items() if k not in dynamic_field_names}
    if isinstance(obj, list):
        return [_strip_dynamic_fields(item) for item in obj]
    return obj


def _import_components() -> tuple[dict, int]:
    """Load every lfx component by running the async importer to completion.

    ``import_langflow_components`` is a coroutine function, so it is driven
    with ``asyncio.run``. The ``"components"`` entry of its result (an empty
    dict when the key is absent) is taken as the category -> components
    mapping, and the component count is the sum of the sizes of its values.
    A one-line summary is printed to stdout.

    Returns:
        Tuple of (modules_dict, components_count)

    Raises:
        RuntimeError: If running the importer or summarizing its result fails;
            the original exception is chained as the cause. A failure to import
            ``lfx.interface.components`` itself happens before the guarded
            section and propagates unchanged.
    """
    import asyncio

    from lfx.interface.components import import_langflow_components

    try:
        import_result = asyncio.run(import_langflow_components())
        modules_dict = import_result.get("components", {})
        components_count = sum(map(len, modules_dict.values()))
        print(f"Discovered {components_count} components across {len(modules_dict)} categories")
    except Exception as e:
        msg = f"Failed to import components: {e}"
        raise RuntimeError(msg) from e

    # Only reached on success: the handler above always raises.
    return modules_dict, components_count


def build_component_index() -> dict:
    """Assemble the component index for everything the lfx importer discovers.

    Steps, in order:

    1. Import all components and read the installed lfx version.
    2. Build ``entries`` as a list of ``[category_name, components]`` pairs,
       with categories and the components inside each category ordered by name.
    3. Remove dynamic keys (see ``_strip_dynamic_fields``) and sort all dict
       keys (see ``_normalize_for_determinism``).
    4. Hash the compact, key-sorted JSON serialization and store the hex digest
       under ``"sha256"``.

    Returns:
        A dictionary with the keys ``version``, ``metadata`` (``num_modules``
        and ``num_components``), ``entries`` and ``sha256``.

    Raises:
        RuntimeError: If the components cannot be imported.
    """
    print("Building component index...")

    modules_dict, components_count = _import_components()
    current_version = _get_lfx_version()

    # One [category, components] pair per category, categories in name order.
    entries = []
    for category in sorted(modules_dict):
        raw_components = modules_dict[category]

        # Components are inserted in name order as well.
        copied_components = {}
        for name in sorted(raw_components):
            # Shallow-copy the component and its metadata so the objects handed
            # back by the importer are never mutated. A component without
            # metadata ends up with an empty "metadata" dict.
            component_copy = dict(raw_components[name])
            component_copy["metadata"] = dict(component_copy.get("metadata", {}))
            copied_components[name] = component_copy

        entries.append([category, copied_components])

    index = {
        "version": current_version,
        "metadata": {
            "num_modules": len(modules_dict),
            "num_components": components_count,
        },
        "entries": entries,
    }

    # Drop the dynamic keys first so they cannot influence the hash, then
    # put every dict into sorted key order.
    print("\nStripping dynamic fields from component metadata...")
    index = _normalize_for_determinism(_strip_dynamic_fields(index))

    # Integrity hash. It is computed over the index WITHOUT the "sha256" key,
    # using the compact (non-indented) serialization with OPT_SORT_KEYS.
    # Key order is pinned twice: by _normalize_for_determinism() above and by
    # OPT_SORT_KEYS here.
    #
    # To verify an index later:
    #   1. load it,
    #   2. remove the "sha256" field,
    #   3. serialize with OPT_SORT_KEYS,
    #   4. compare the SHA256 of those bytes with the stored value.
    payload = orjson.dumps(index, option=orjson.OPT_SORT_KEYS)
    digest = hashlib.sha256(payload).hexdigest()
    index["sha256"] = digest  # type: ignore[index]

    return index


# Where the generated index is written: two directories above this script,
# then src/lfx/src/lfx/_assets/component_index.json.
COMPONENT_INDEX_PATH = Path(__file__).parent.parent.joinpath(
    "src", "lfx", "src", "lfx", "_assets", "component_index.json"
)


def main():
    """Build the component index and write it to ``COMPONENT_INDEX_PATH``.

    If building the index fails for any reason, the error message is printed
    to stderr and the process exits with status 1. Otherwise the index is
    serialized as key-sorted JSON with 2-space indentation, written to disk
    (creating the target directory if needed), and a short summary is printed.
    """
    try:
        # Build the index - will raise on any error
        index = build_component_index()
    except Exception as e:  # noqa: BLE001
        # Top-level boundary of the script: report and exit non-zero.
        print(f"Failed to build component index: {e}", file=sys.stderr)
        sys.exit(1)

    # Use the standard component index path (defined at module level)
    output_path = COMPONENT_INDEX_PATH

    # Create directory if it doesn't exist
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Pretty-print for readable git diffs and resolvable merge conflicts
    print(f"\nWriting formatted index to {output_path}")
    json_bytes = orjson.dumps(index, option=orjson.OPT_SORT_KEYS | orjson.OPT_INDENT_2)

    # orjson already produces UTF-8 bytes, so write them verbatim. Going
    # through write_text() would decode and re-encode for nothing and, in text
    # mode, translate "\n" to the platform line ending - the file would then
    # differ byte-for-byte depending on the OS that generated it.
    output_path.write_bytes(json_bytes)

    print("\nIndex successfully written!")
    print(f"  Version: {index['version']}")
    print(f"  Modules: {index['metadata']['num_modules']}")
    print(f"  Components: {index['metadata']['num_components']}")
    print(f"  SHA256: {index['sha256']}")


if __name__ == "__main__":
    main()