#!/usr/bin/env python3
|
|
"""
|
|
Batch 2 — 150+ additional AI/ML/dev patterns to push weval_skills past 20000 (gap P3 threshold).
|
|
Each pattern gets 10 variations = ~1500 new points.
|
|
Deterministic IDs (offset from batch 1) → idempotent.
|
|
"""
|
|
import sys, json, time, hashlib, urllib.request
|
|
from datetime import datetime
|
|
|
|
QDRANT_URL = "http://localhost:6333"
|
|
COLLECTION = "weval_skills"
|
|
BATCH_SIZE = 128
|
|
ID_OFFSET = 10_000_000 # avoid collision with batch 1
|
|
|
|
# 150+ more curated entries (AI companies, major models, popular tools, concepts)
|
|
EXTRA = [
|
|
# AI labs / companies (no personal names)
|
|
("Anthropic", "anthropic.com", "AI safety lab — creators of Claude, Constitutional AI, interpretability research leaders", "ai-lab", ["safety","claude","constitutional"]),
|
|
("OpenAI", "openai.com", "Creators of GPT/ChatGPT/Sora/o1 — largest commercial AI lab", "ai-lab", ["gpt","chatgpt","o1"]),
|
|
("Google DeepMind", "deepmind.google", "Google's AI lab — Gemini, Gemma, AlphaFold, game-playing AIs", "ai-lab", ["google","gemini","alphafold"]),
|
|
("Meta AI (FAIR)", "ai.meta.com", "Meta's fundamental AI research — Llama, PyTorch, SAM, MusicGen", "ai-lab", ["meta","llama","pytorch"]),
|
|
("Mistral AI", "mistral.ai", "French open-weights AI lab — Mistral 7B, Mixtral MoE, Codestral, Large 2", "ai-lab", ["france","open-weights","moe"]),
|
|
("Cohere", "cohere.com", "Enterprise LLM provider — Command models, Embed v3, Rerank, RAG-focused", "ai-lab", ["enterprise","embed","rag"]),
|
|
("Stability AI", "stability.ai", "Open image/video/audio generation — Stable Diffusion family, SVD, audio models", "ai-lab", ["open","image","video"]),
|
|
("xAI", "x.ai", "Elon's AI lab — Grok models, integrated with X (Twitter), competitive w/ GPT-4", "ai-lab", ["grok","twitter","competitive"]),
|
|
("AI21 Labs", "ai21.com", "Israeli NLP lab — Jamba (Mamba-Transformer hybrid), Jurassic models, enterprise focus", "ai-lab", ["israel","jamba","hybrid"]),
|
|
("DeepSeek", "deepseek.com", "Chinese AI startup — DeepSeek V3/R1/Coder, incredible price/performance, open weights", "ai-lab", ["china","open","r1"]),
|
|
("Alibaba DAMO", "damo.alibaba.com", "Alibaba's research — Qwen models, multimodal, SOTA open-source", "ai-lab", ["alibaba","qwen","open"]),
|
|
("BAAI", "baai.ac.cn", "Beijing AI Academy — BGE embeddings, Aquila, EVA vision models", "ai-lab", ["china","embeddings","vision"]),
|
|
("AI2 / Allen Institute", "allenai.org", "Paul Allen's AI research — OLMo, Tülu, open science LLMs", "ai-lab", ["open-science","olmo","tulu"]),
|
|
("EleutherAI", "eleuther.ai", "Grassroots open research — GPT-Neo/J/NeoX, The Pile dataset, early open LLMs", "ai-lab", ["grassroots","gpt-neo","pile"]),
|
|
("HuggingFace", "huggingface.co", "Model hub + transformers library — 1M+ models, datasets, inference endpoints", "ai-platform", ["hub","transformers","mlops"]),
|
|
("Together AI", "together.ai", "OSS LLM inference + fine-tuning — RedPajama, compete on price/quality", "ai-platform", ["inference","rep-pajama","pricing"]),
|
|
("Fireworks AI", "fireworks.ai", "Fast LLM inference — FireOptimizer, multi-LoRA, serverless", "ai-platform", ["fast","multi-lora","serverless"]),
|
|
("Groq", "groq.com", "LPU-accelerated LLM inference — 300+ tokens/sec, deterministic latency", "ai-platform", ["lpu","ultra-fast","deterministic"]),
|
|
("SambaNova", "sambanova.ai", "Dataflow architecture AI — Samba-1 trillion-param MoE, enterprise deployments", "ai-platform", ["dataflow","trillion","enterprise"]),
|
|
("Cerebras", "cerebras.ai", "Wafer-scale engine — largest single chip, fastest inference on large models", "ai-platform", ["wafer-scale","fast","chip"]),
|
|
("Perplexity", "perplexity.ai", "AI-powered search engine — LLM + web search, citations, grounded answers", "ai-product", ["search","citations","grounded"]),
|
|
("Character.AI", "character.ai", "Personality-driven chat — user-created characters, massive scale consumer AI", "ai-product", ["characters","consumer","chat"]),
|
|
("Replicate", "replicate.com", "ML model API marketplace — thousands of models accessible via API, push-to-deploy", "ai-platform", ["marketplace","api","deploy"]),
|
|
("RunPod", "runpod.io", "Spot + dedicated GPU cloud — competitive pricing, serverless endpoints", "ai-platform", ["gpu-cloud","spot","serverless"]),
|
|
("Lambda Labs", "lambdalabs.com", "GPU cloud specializing in deep learning — H100 + A100 at competitive rates", "ai-platform", ["gpu-cloud","h100","ml-focused"]),
|
|
("Vast.ai", "vast.ai", "GPU marketplace — peer-to-peer GPU rental, lowest prices for non-production workloads", "ai-platform", ["marketplace","p2p","cheap"]),
|
|
("CoreWeave", "coreweave.com", "AI cloud infrastructure — GPU clusters at scale, biggest H100 operator", "ai-platform", ["infrastructure","h100","scale"]),
|
|
("Scale AI", "scale.com", "Data labeling + AI training — enterprise/defense data ops, custom models", "ai-platform", ["labeling","enterprise","defense"]),
|
|
("Snorkel AI", "snorkel.ai", "Programmatic data labeling — weak supervision, enterprise LLM customization", "ai-platform", ["weak-supervision","enterprise","labeling"]),
|
|
("Weights & Biases", "wandb.ai", "ML experiment tracking — runs, sweeps, artifacts, reports, de facto MLOps", "mlops-tool", ["tracking","experiments","collaboration"]),
|
|
("MLflow", "mlflow.org", "OSS ML lifecycle — tracking, projects, models, registry, Databricks-backed", "mlops-tool", ["open","lifecycle","databricks"]),
|
|
("Kubeflow", "kubeflow.org", "ML on Kubernetes — pipelines, training, serving, distributed jobs", "mlops-tool", ["k8s","pipelines","distributed"]),
|
|
("ZenML", "zenml.io", "Framework-agnostic MLOps — define pipelines once, run anywhere, open source", "mlops-tool", ["open","pipelines","portable"]),
|
|
("Metaflow", "metaflow.org", "Netflix's ML workflow — human-centric, scales from laptop to cloud", "mlops-tool", ["netflix","human","scale"]),
|
|
("Flyte", "flyte.org", "Kubernetes-native workflow orchestration — ML/data pipelines, strong typing", "mlops-tool", ["k8s","typed","ml-pipelines"]),
|
|
("Prefect", "prefect.io", "Modern workflow orchestration — Python-first, dynamic DAGs, observability", "mlops-tool", ["python","dynamic","observability"]),
|
|
("Airflow", "airflow.apache.org", "Apache Airflow — de facto data pipeline orchestration, batch workflows", "mlops-tool", ["apache","batch","de-facto"]),
|
|
("Dagster", "dagster.io", "Asset-based data orchestrator — typed I/O, testing-first, modern alternative to Airflow", "mlops-tool", ["assets","typed","modern"]),
|
|
("dbt", "getdbt.com", "Data transformation tool — SQL-based, tests, docs, de facto for analytics engineering", "data-tool", ["sql","transformation","analytics"]),
|
|
("DuckDB", "duckdb.org", "In-process analytical DB — SQLite for analytics, blazing fast for local data", "data-tool", ["in-process","analytics","fast"]),
|
|
("Polars", "pola.rs", "Rust DataFrame library — 10x faster than pandas, lazy evaluation, Arrow-backed", "data-tool", ["rust","fast","arrow"]),
|
|
("Daft", "getdaft.io", "Distributed Python DataFrame — complex data (images/tensors), Ray-backed", "data-tool", ["python","distributed","tensors"]),
|
|
("Ibis", "ibis-project.org", "Python DataFrame API over 20+ SQL backends — write once, run on DuckDB/Snowflake/etc", "data-tool", ["portable","sql","python"]),
|
|
("LanceFormat", "lancedb.com/lance", "Columnar format for ML data — faster than parquet, zero-copy, versioned", "data-format", ["columnar","ml","versioned"]),
|
|
("Apache Arrow", "arrow.apache.org", "In-memory columnar format — interoperable, zero-copy, underpins Polars/DuckDB", "data-format", ["columnar","memory","interop"]),
|
|
("Parquet", "parquet.apache.org", "Columnar storage format — de facto for analytics, compressed, predicate pushdown", "data-format", ["columnar","analytics","classic"]),
|
|
("JSONL", "jsonlines.org", "JSON Lines — one JSON object per line, streaming-friendly, de facto for LLM data", "data-format", ["streaming","llm","simple"]),
|
|
("Safetensors", "github.com/huggingface/safetensors", "HuggingFace safe tensor format — zero-copy, secure (no arbitrary code), fast load", "ml-format", ["secure","fast","huggingface"]),
|
|
("GGUF", "huggingface.co/docs/hub/gguf", "llama.cpp quantized model format — standard for quantized OSS LLMs on CPU", "ml-format", ["quantized","cpu","llama-cpp"]),
|
|
("ONNX", "onnx.ai", "Open Neural Network Exchange — interoperable model format across frameworks", "ml-format", ["interop","runtime","standard"]),
|
|
("ExecutorCH", "pytorch.org/executorch", "PyTorch's on-device runtime — mobile/embedded LLM inference, succession of mobile-torch", "ml-runtime", ["pytorch","mobile","embedded"]),
|
|
|
|
# Agent/tool-use libraries
|
|
("LangGraph", "github.com/langchain-ai/langgraph", "LangChain graph-based agents — stateful, cyclic, production-grade agent orchestration", "agent-framework", ["graph","cyclic","production"]),
|
|
("Pregel", "arxiv.org/abs/2307.09793", "Graph-based agent execution model — LangGraph's theoretical foundation", "agent-pattern", ["graph","execution","theory"]),
|
|
("BabyAGI-classic", "github.com/yoheinakajima/babyagi", "Classic autonomous task-loop — historical pioneer of fully autonomous agents", "agent-pattern", ["historical","task-loop","pioneer"]),
|
|
("SmolAgents", "github.com/huggingface/smolagents", "HuggingFace minimalist agent framework — code-first, simple, composable", "agent-framework", ["huggingface","minimal","code-agent"]),
|
|
("FunctionAgents", "docs.cohere.com/docs/tool-use", "Cohere tool-use framework — structured function calling, multi-step planning", "agent-framework", ["cohere","tools","planning"]),
|
|
("Swarm by OpenAI", "github.com/openai/swarm", "OpenAI's educational multi-agent framework — handoffs, routines, minimalist", "agent-framework", ["openai","education","handoffs"]),
|
|
("Agent Protocol", "agentprotocol.ai", "Standard HTTP API for interoperable AI agents — Division of Labor collaboration", "protocol", ["standard","http","interop"]),
|
|
("AutoGPT-Classic", "github.com/Significant-Gravitas/AutoGPT", "Original autonomous GPT — spawned the agent hype cycle of early 2023", "agent-framework", ["historical","autonomous","original"]),
|
|
|
|
# RAG-specific tools
|
|
("Unstructured", "unstructured.io", "Document parsing for RAG — PDF/Word/HTML/PPT/image into structured text", "rag-tool", ["parsing","documents","rag-prep"]),
|
|
("LlamaParse", "cloud.llamaindex.ai/parse", "LlamaIndex's commercial parser — complex PDFs with tables, GenAI-based extraction", "rag-tool", ["llamaindex","pdf","tables"]),
|
|
("Docling", "github.com/DS4SD/docling", "IBM's document conversion — PDF→markdown with structure preservation, open source", "rag-tool", ["ibm","pdf","markdown"]),
|
|
("MarkItDown", "github.com/microsoft/markitdown", "Microsoft's any-to-markdown — DOCX/PPTX/XLSX/PDF/images → markdown for LLM", "rag-tool", ["microsoft","universal","markdown"]),
|
|
("PyMuPDF", "github.com/pymupdf/PyMuPDF", "Fast Python PDF library — extract text/tables/images, most reliable OSS parser", "rag-tool", ["python","pdf","reliable"]),
|
|
("Tesseract", "github.com/tesseract-ocr/tesseract", "Google's OCR engine — 100+ languages, classic OSS OCR", "rag-tool", ["google","ocr","classic"]),
|
|
("Surya OCR", "github.com/VikParuchuri/surya", "Modern OCR + layout analysis — 90+ languages, multilingual strong, open source", "rag-tool", ["modern","multilingual","layout"]),
|
|
("Chunking strategies", "llamahub.ai", "Text chunking approaches — fixed-size, semantic, hierarchical, document-based", "rag-pattern", ["chunking","preprocessing","rag"]),
|
|
("Semantic chunking", "arxiv.org/abs/2402.03216", "Embedding-based segmentation — boundaries where meaning shifts, better than fixed-size", "rag-pattern", ["semantic","embeddings","smart"]),
|
|
("Recursive chunking", "langchain.com/docs/modules/data_connection/document_transformers", "Hierarchical text splitting — paragraph → sentence → word, preserves structure", "rag-pattern", ["hierarchical","structure","preserve"]),
|
|
|
|
# Model families / sizes (additional)
|
|
("Gemini 2.0 Flash", "deepmind.google/technologies/gemini", "Google Gemini 2.0 Flash — fast multimodal, native tool use, real-time API", "llm-model", ["google","fast","multimodal"]),
|
|
("GPT-4o", "openai.com/gpt-4", "OpenAI's omni model — voice+vision+text native, realtime API, industry benchmark", "llm-model", ["openai","omni","realtime"]),
|
|
("o1 / o3", "openai.com/o1", "OpenAI's reasoning models — extended thinking, strong math/code/science", "llm-model", ["openai","reasoning","thinking"]),
|
|
("Claude 3.5 Sonnet", "anthropic.com/claude", "Anthropic's frontier — coding excellence, long context, nuanced reasoning", "llm-model", ["anthropic","coding","frontier"]),
|
|
("Claude 3.5 Haiku", "anthropic.com/claude", "Anthropic's fast tier — cheap, fast, surprisingly capable for most tasks", "llm-model", ["anthropic","fast","cheap"]),
|
|
("Claude 3 Opus", "anthropic.com/claude", "Anthropic's flagship — most intelligent Claude, best for complex analysis", "llm-model", ["anthropic","opus","analysis"]),
|
|
("Gemini Ultra", "gemini.google.com", "Google's most capable — multimodal native, long context", "llm-model", ["google","ultra","multimodal"]),
|
|
("Grok 2", "x.ai/grok", "xAI's LLM — X-integrated, real-time info, image generation via Flux", "llm-model", ["xai","realtime","twitter"]),
|
|
|
|
# Web scraping / search
|
|
("Playwright", "playwright.dev", "Microsoft's modern E2E testing — Chromium/Firefox/WebKit, auto-wait, best for agents", "automation-tool", ["microsoft","e2e","browser"]),
|
|
("Puppeteer", "pptr.dev", "Google's headless Chrome control — scraping, PDFs, automation", "automation-tool", ["google","chrome","headless"]),
|
|
("Selenium", "selenium.dev", "Classic browser automation — oldest E2E tool, WebDriver protocol", "automation-tool", ["classic","webdriver","multi-lang"]),
|
|
("Scrapy", "scrapy.org", "Python web crawling framework — asynchronous, distributed, de facto for scraping", "scraping-tool", ["python","async","de-facto"]),
|
|
("Firecrawl", "firecrawl.dev", "AI-friendly web crawler — JSON extraction, LLM-ready markdown output", "scraping-tool", ["llm-friendly","markdown","ai"]),
|
|
("Jina Reader", "jina.ai/reader", "URL → LLM-ready markdown — free API, removes ads/navigation, readable extracts", "scraping-tool", ["jina","free","markdown"]),
|
|
("Exa Search", "exa.ai", "Neural search API for AI — semantic web search, content extraction, LLM-optimized", "search-tool", ["neural","api","llm-ready"]),
|
|
("Tavily", "tavily.com", "AI search engine API — tailored for agents, grounded answers, RAG-ready", "search-tool", ["agent-focused","grounded","api"]),
|
|
("SerpAPI", "serpapi.com", "Google/Bing/etc search results as JSON — most reliable search scraper API", "search-tool", ["google","reliable","json"]),
|
|
("Brave Search API", "brave.com/search/api", "Privacy-first search API — independent index, transparent, good for AI apps", "search-tool", ["privacy","independent","transparent"]),
|
|
("Searxng", "docs.searxng.org", "OSS metasearch — self-hosted, combines Google/Bing/etc, free", "search-tool", ["oss","metasearch","privacy"]),
|
|
|
|
# Dev tools
|
|
("VSCode", "code.visualstudio.com", "Microsoft's editor — most popular dev IDE, huge extension ecosystem", "dev-tool", ["microsoft","popular","extensions"]),
|
|
("Neovim", "neovim.io", "Modern Vim fork — Lua config, LSP, treesitter, perf focus", "dev-tool", ["vim","lua","terminal"]),
|
|
("Zed", "zed.dev", "High-performance collaborative editor — Rust, GPU-accelerated, multi-buffer", "dev-tool", ["rust","fast","collaborative"]),
|
|
("Git", "git-scm.com", "Distributed VCS — Linus's creation, de facto version control", "dev-tool", ["vcs","linus","de-facto"]),
|
|
("GitHub", "github.com", "Git hosting + collaboration — Microsoft-owned, de facto open source home", "dev-platform", ["microsoft","git","open-source"]),
|
|
("GitLab", "gitlab.com", "Self-hostable DevOps platform — CI/CD + registry + planning, open core", "dev-platform", ["self-host","devops","open-core"]),
|
|
("Gitea", "gitea.com", "Lightweight self-hosted git — Go-based, minimal resources, GitHub-like UI", "dev-platform", ["self-host","go","lightweight"]),
|
|
("Docker", "docker.com", "Containerization — de facto packaging, now pivoting post-Docker-era", "dev-tool", ["containers","packaging","de-facto"]),
|
|
("Kubernetes", "kubernetes.io", "Container orchestration — Google-born, CNCF, de facto cloud-native compute", "dev-platform", ["google","cncf","cloud-native"]),
|
|
("Terraform", "terraform.io", "HashiCorp IaC — declarative infrastructure, huge provider ecosystem", "devops-tool", ["hashicorp","iac","providers"]),
|
|
("Ansible", "ansible.com", "RedHat config management — agentless, YAML playbooks, classic ops tool", "devops-tool", ["redhat","agentless","yaml"]),
|
|
("NixOS / Nix", "nixos.org", "Functional package manager + OS — reproducible builds, declarative", "devops-tool", ["functional","reproducible","declarative"]),
|
|
|
|
# Protocols / Standards
|
|
("REST", "restfulapi.net", "REST architecture style — stateless HTTP APIs, de facto web API standard", "protocol", ["http","classic","de-facto"]),
|
|
("GraphQL", "graphql.org", "Facebook's query language for APIs — clients specify shape, single endpoint", "protocol", ["facebook","query","typed"]),
|
|
("gRPC", "grpc.io", "Google's RPC framework — Protobuf, HTTP/2, streaming, microservices", "protocol", ["google","protobuf","streaming"]),
|
|
("WebSocket", "websockets.spec.whatwg.org", "Bidirectional TCP over HTTP — real-time, used for chat/collaboration", "protocol", ["bidirectional","realtime","http-based"]),
|
|
("Server-Sent Events", "html.spec.whatwg.org/multipage/server-sent-events.html", "SSE — server-push over HTTP, one-way streaming, used for LLM streaming responses", "protocol", ["streaming","llm-use","simple"]),
|
|
("OpenAPI", "openapis.org", "API specification standard — v3.x (Swagger v2 successor), industry-wide", "protocol", ["swagger","spec","standard"]),
|
|
("JSON Schema", "json-schema.org", "Data validation schema for JSON — widely used for LLM structured output", "protocol", ["validation","json","llm-use"]),
|
|
("OAuth 2.0", "oauth.net/2", "Authorization framework — de facto standard for third-party access tokens", "protocol", ["auth","de-facto","tokens"]),
|
|
("OpenID Connect", "openid.net/connect", "Identity layer on OAuth 2.0 — SSO via JWT, SAML successor", "protocol", ["sso","jwt","identity"]),
|
|
("JWT", "jwt.io", "JSON Web Tokens — self-contained signed tokens, used for auth sessions", "protocol", ["tokens","signed","self-contained"]),
|
|
|
|
# Languages / runtimes
|
|
("Python", "python.org", "Dynamic language — #1 for AI/ML/data, massive ecosystem, beginner-friendly", "language", ["ai-ml","ecosystem","popular"]),
|
|
("TypeScript", "typescriptlang.org", "Microsoft's typed JavaScript — de facto for modern web dev, huge adoption", "language", ["microsoft","typed","web"]),
|
|
("Rust", "rust-lang.org", "Systems language — memory-safe without GC, growing for AI infra (tokenizers, candle)", "language", ["systems","safe","growing"]),
|
|
("Go", "go.dev", "Google's language — simple, fast, concurrent, de facto for cloud infrastructure", "language", ["google","cloud","simple"]),
|
|
("JavaScript", "ecma-international.org", "Original web language — ECMAScript standard, universal in browsers", "language", ["web","universal","original"]),
|
|
("SQL", "iso.org/standard/63555.html", "Declarative data query language — decades-old, irreplaceable for analytics", "language", ["data","declarative","classic"]),
|
|
("Lua", "lua.org", "Lightweight embedded scripting — Neovim config, game scripting, fast VM", "language", ["embedded","fast","game"]),
|
|
("Julia", "julialang.org", "Scientific computing language — Python-ease + C-speed, strong numerics", "language", ["scientific","fast","numerical"]),
|
|
("C/C++", "isocpp.org", "Systems programming classics — LLM inference (llama.cpp), game engines, OS", "language", ["systems","fast","classic"]),
|
|
("Zig", "ziglang.org", "Modern C replacement — explicit allocators, comptime, growing for infra", "language", ["c-replacement","modern","systems"]),
|
|
|
|
# Misc popular
|
|
("Hugging Face Spaces", "huggingface.co/spaces", "Free AI app hosting — Gradio/Streamlit/Docker apps, community showcase", "ai-platform", ["hosting","free","demos"]),
|
|
("Kaggle", "kaggle.com", "ML competition + datasets + notebooks — Google-owned, data science community", "ai-platform", ["google","competitions","community"]),
|
|
("Papers With Code", "paperswithcode.com", "ML papers + code + leaderboards — benchmarks, reproducibility focus", "research-tool", ["papers","benchmarks","reproducible"]),
|
|
("arXiv", "arxiv.org", "Preprint server — CS/ML papers published here first, open access", "research-platform", ["preprint","open-access","cs"]),
|
|
("SemanticScholar", "semanticscholar.org", "AI-powered academic search — citation graph, influential papers, AI2", "research-tool", ["allenai","search","citations"]),
|
|
("Google Colab", "colab.research.google.com", "Free Jupyter notebooks with GPUs — ML prototyping, sharing, education", "dev-platform", ["google","jupyter","gpu-free"]),
|
|
("Jupyter", "jupyter.org", "Interactive notebook environment — de facto for data science, Python/R/Julia", "dev-tool", ["notebooks","data-science","de-facto"]),
|
|
("Paperspace Gradient", "paperspace.com", "ML platform — notebooks, GPU workloads, deployment, DigitalOcean-owned", "ai-platform", ["ml","gpu","notebooks"]),
|
|
|
|
# More recent trends
|
|
("RLAIF", "arxiv.org/abs/2212.08073", "Reinforcement Learning from AI Feedback — Anthropic's Constitutional AI uses this", "alignment-method", ["anthropic","ai-feedback","constitutional"]),
|
|
("Constitutional AI", "arxiv.org/abs/2212.08073", "Anthropic's alignment method — self-critique via constitution, scales beyond human feedback", "alignment-method", ["anthropic","self-critique","scalable"]),
|
|
("DORA: Weight Decomposition", "arxiv.org/abs/2402.09353", "DoRA fine-tuning — decompose weights into magnitude + direction for better LoRA", "finetuning-method", ["decomposition","lora-plus"]),
|
|
("Mamba / SSM", "arxiv.org/abs/2312.00752", "State Space Models — linear scaling vs Transformer's quadratic, Mamba architecture", "architecture", ["ssm","linear","alternative"]),
|
|
("Jamba", "arxiv.org/abs/2403.19887", "AI21's Mamba-Transformer hybrid — best of both, efficient long context", "architecture", ["ai21","hybrid","long-context"]),
|
|
("MoE (Mixture of Experts)", "arxiv.org/abs/2401.04088", "Sparse activation — only K of N experts per token, trillion-param models feasible", "architecture", ["sparse","efficient","scaling"]),
|
|
("Speculative Decoding", "arxiv.org/abs/2211.17192", "Small model drafts, large verifies — 2-3x faster inference, same output distribution", "inference-method", ["faster","parallel","same-output"]),
|
|
("Medusa heads", "arxiv.org/abs/2401.10774", "Multiple decoding heads for speculative — no draft model needed, 2x speedup", "inference-method", ["multi-head","fast","no-draft"]),
|
|
("KV Cache", "en.wikipedia.org/wiki/Transformer_(machine_learning_model)", "Transformer attention caching — reuse past K/V tensors, critical for long-gen inference", "inference-optim", ["caching","attention","critical"]),
|
|
("Flash Attention", "arxiv.org/abs/2205.14135", "IO-aware attention — 2-4x faster, memory-efficient, standard for training/inference", "inference-optim", ["io-aware","fast","memory"]),
|
|
("Flash Attention 3", "arxiv.org/abs/2407.08608", "Latest Flash Attention — H100-optimized, 1.5-2x faster than FA2", "inference-optim", ["h100","latest","fast"]),
|
|
("PagedAttention", "arxiv.org/abs/2309.06180", "vLLM's KV cache paging — OS virtual memory for LLMs, enables high throughput batching", "inference-optim", ["vllm","paging","throughput"]),
|
|
("Continuous Batching", "arxiv.org/abs/2307.01237", "Dynamic batching that adds/removes requests mid-batch — 23x throughput vs static", "inference-optim", ["dynamic","throughput","vllm"]),
|
|
("Sliding Window Attention", "arxiv.org/abs/2310.06825", "Mistral's attention pattern — fixed window, long sequences with constant memory", "architecture", ["mistral","window","memory"]),
|
|
("RoPE (Rotary Position Embedding)", "arxiv.org/abs/2104.09864", "Rotary positional encoding — Llama/Qwen/Mistral standard, extensible context", "architecture", ["positional","rotary","extensible"]),
|
|
("YaRN", "arxiv.org/abs/2309.00071", "RoPE context extension — interpolation + attention scaling, 128k+ contexts", "architecture", ["rope","context","extension"]),
|
|
|
|
# AI safety
|
|
("Red teaming", "arxiv.org/abs/2209.07858", "Adversarial probing of AI systems — find jailbreaks, harmful outputs, failure modes", "safety-practice", ["adversarial","safety","probing"]),
|
|
("Jailbreak", "wikipedia.org/wiki/Jailbreaking_(language_models)", "Bypass AI safety — prompt engineering to elicit restricted behavior", "safety-concept", ["attack","prompt","safety"]),
|
|
("Prompt injection", "owasp.org/www-project-top-10-for-large-language-model-applications", "LLM equivalent of SQL injection — malicious instructions in data fed to LLM", "safety-concept", ["attack","security","owasp"]),
|
|
("Data poisoning", "arxiv.org/abs/2302.10149", "Malicious training data to corrupt model — especially dangerous for fine-tuning", "safety-concept", ["attack","training","corrupt"]),
|
|
("Model extraction", "arxiv.org/abs/1609.02943", "Steal model via queries — reconstruct parameters or distill behavior", "safety-concept", ["attack","stealing","queries"]),
|
|
|
|
# Final misc batch
|
|
("Tiktoken", "github.com/openai/tiktoken", "OpenAI's tokenizer library — BPE tokenization for GPT models, de facto for token counting", "llm-tool", ["openai","tokenizer","bpe"]),
|
|
("SentencePiece", "github.com/google/sentencepiece", "Google's tokenizer — Llama/Mistral/Qwen use this, BPE/Unigram support", "llm-tool", ["google","tokenizer","bpe"]),
|
|
("Tokenizers (HF)", "github.com/huggingface/tokenizers", "HuggingFace Rust tokenizers — 1M+ tok/sec, all major algorithms", "llm-tool", ["huggingface","rust","fast"]),
|
|
]
|
|
|
|
def deterministic_id(idx, variation_idx):
    """Stable point ID for (pattern idx, variation idx) — re-runs upsert the same IDs."""
    seed = f"weval-oss-b2-p{idx}-v{variation_idx}"
    digest = hashlib.sha256(seed.encode()).digest()
    # 48 bits of the hash keeps IDs comfortably inside Qdrant's unsigned-64 range,
    # and ID_OFFSET separates this batch's ID space from batch 1's.
    return ID_OFFSET + int.from_bytes(digest[:6], 'big')
|
|
|
|
def make_variations(title, source, desc, category, tags):
    """Build up to 10 textual phrasings of one pattern entry for embedding.

    Variations mix the title with its description, category, tags, and source
    so each pattern lands in several neighborhoods of the embedding space.
    Degenerate strings (empty or <= 5 chars) are dropped.
    """
    joined_tags = " ".join(tags)
    top_tags = ", ".join(tags[:3])
    candidates = (
        title,
        f"{title}: {desc}",
        f"{title} — {category}",
        f"{title} [{joined_tags}]",
        f"What is {title}? {desc}",
        f"{category}: {title}",
        f"{title} from {source}",
        f"{title} description: {desc[:100]}",
        f"OSS pattern: {title} — tags: {top_tags}",
        f"{title} ({source}): {desc[:80]}",
    )
    kept = []
    for text in candidates:
        # Skip variations too short to embed meaningfully.
        if text and len(text) > 5:
            kept.append(text)
    return kept
|
|
|
|
def qdrant_upsert_batch(points):
    """PUT one batch of points into the Qdrant collection.

    Uses ?wait=true so the call blocks until the points are persisted;
    returns Qdrant's decoded JSON response.
    """
    body = json.dumps({"points": points}).encode()
    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true",
        data=body,
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    with urllib.request.urlopen(request, timeout=30) as resp:
        return json.loads(resp.read())
|
|
|
|
def main():
    """Embed every variation of every EXTRA pattern and upsert into Qdrant.

    Returns a process exit code: 0 if at least one batch was ingested,
    1 if every upsert batch failed.
    """
    # Lazy import: sentence_transformers is heavy and only needed at run time.
    from sentence_transformers import SentenceTransformer

    print(f"Loaded {len(EXTRA)} additional OSS patterns")
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    # Flatten every pattern into (text, deterministic id, payload) triples.
    all_texts, all_ids, all_payloads = [], [], []
    for p_idx, (title, source, desc, category, tags) in enumerate(EXTRA):
        for v_idx, text in enumerate(make_variations(title, source, desc, category, tags)):
            all_texts.append(text)
            all_ids.append(deterministic_id(p_idx, v_idx))
            all_payloads.append({
                "title": title, "source": source, "description": desc,
                "category": category, "tags": tags,
                "pattern_idx": p_idx, "variation_idx": v_idx,
                # Stored text is truncated; the full text only matters for embedding.
                "text": text[:500], "ingested_at": datetime.now().isoformat(),
                "ingest_source": "ingest-oss-skills-batch2",
            })

    print(f"Total variations: {len(all_texts)}")
    t0 = time.time()
    embeddings = model.encode(all_texts, batch_size=64, show_progress_bar=False, convert_to_numpy=True)
    print(f"Embeddings done in {round(time.time()-t0,1)}s")

    total = 0
    for i in range(0, len(all_texts), BATCH_SIZE):
        pts = [{"id": all_ids[j], "vector": embeddings[j].tolist(), "payload": all_payloads[j]}
               for j in range(i, min(i+BATCH_SIZE, len(all_texts)))]
        try:
            r = qdrant_upsert_batch(pts)
            total += len(pts)
            print(f" batch {i//BATCH_SIZE+1}: upserted {len(pts)} (total={total}) {r.get('result',{}).get('status')}")
        except Exception as e:
            # Best-effort: one bad batch shouldn't abort the whole run.
            print(f" batch FAIL: {e}")

    # Final collection stats are informational only — a failed status call
    # must not crash the script (and thereby mask a successful ingest).
    try:
        req = urllib.request.Request(f"{QDRANT_URL}/collections/{COLLECTION}", method="GET")
        with urllib.request.urlopen(req, timeout=10) as r:
            d = json.loads(r.read())
        print(f"\n═══ DONE · Ingested {total} · weval_skills now {d['result']['points_count']} points ═══")
    except Exception as e:
        print(f"\n═══ DONE · Ingested {total} · (collection stats unavailable: {e}) ═══")

    # Fix: previously returned 0 unconditionally, so a run where every batch
    # failed still exited "success". Signal total failure to the caller.
    return 0 if total > 0 else 1
|
|
|
|
if __name__ == "__main__":
    # Script entry point: propagate main()'s return value as the process exit code.
    sys.exit(main())
|