#!/usr/bin/env python3
|
|
"""
|
|
Batch 2 — 150+ additional AI/ML/dev patterns to push weval_skills past 20000 (gap P3 threshold).
|
|
Each pattern gets 10 variations = ~1500 new points.
|
|
Deterministic IDs (offset from batch 1) → idempotent.
|
|
"""
|
|
import sys, json, time, hashlib, urllib.request
|
|
from datetime import datetime
|
|
|
|
QDRANT_URL = "http://localhost:6333"
|
|
COLLECTION = "weval_skills"
|
|
BATCH_SIZE = 128
|
|
ID_OFFSET = 10_000_000 # avoid collision with batch 1
|
|
|
|
# 150+ more curated entries (AI companies, major models, popular tools, concepts)
|
|
EXTRA = [
|
|
# AI labs / companies (no personal names)
|
|
("Anthropic", "anthropic.com", "AI safety lab — creators of Claude, Constitutional AI, interpretability research leaders", "ai-lab", ["safety","claude","constitutional"]),
|
|
("OpenAI", "openai.com", "Creators of GPT/ChatGPT/Sora/o1 — largest commercial AI lab", "ai-lab", ["gpt","chatgpt","o1"]),
|
|
("Google DeepMind", "deepmind.google", "Google's AI lab — Gemini, Gemma, AlphaFold, game-playing AIs", "ai-lab", ["google","gemini","alphafold"]),
|
|
("Meta AI (FAIR)", "ai.meta.com", "Meta's fundamental AI research — Llama, PyTorch, SAM, MusicGen", "ai-lab", ["meta","llama","pytorch"]),
|
|
("Mistral AI", "mistral.ai", "French open-weights AI lab — Mistral 7B, Mixtral MoE, Codestral, Large 2", "ai-lab", ["france","open-weights","moe"]),
|
|
("Cohere", "cohere.com", "Enterprise LLM provider — Command models, Embed v3, Rerank, RAG-focused", "ai-lab", ["enterprise","embed","rag"]),
|
|
("Stability AI", "stability.ai", "Open image/video/audio generation — Stable Diffusion family, SVD, audio models", "ai-lab", ["open","image","video"]),
|
|
("xAI", "x.ai", "Elon's AI lab — Grok models, integrated with X (Twitter), competitive w/ GPT-4", "ai-lab", ["grok","twitter","competitive"]),
|
|
("AI21 Labs", "ai21.com", "Israeli NLP lab — Jamba (Mamba-Transformer hybrid), Jurassic models, enterprise focus", "ai-lab", ["israel","jamba","hybrid"]),
|
|
("DeepSeek", "deepseek.com", "Chinese AI startup — DeepSeek V3/R1/Coder, incredible price/performance, open weights", "ai-lab", ["china","open","r1"]),
|
|
("Alibaba DAMO", "damo.alibaba.com", "Alibaba's research — Qwen models, multimodal, SOTA open-source", "ai-lab", ["alibaba","qwen","open"]),
|
|
("BAAI", "baai.ac.cn", "Beijing AI Academy — BGE embeddings, Aquila, EVA vision models", "ai-lab", ["china","embeddings","vision"]),
|
|
("AI2 / Allen Institute", "allenai.org", "Paul Allen's AI research — OLMo, Tülu, open science LLMs", "ai-lab", ["open-science","olmo","tulu"]),
|
|
("EleutherAI", "eleuther.ai", "Grassroots open research — GPT-Neo/J/NeoX, The Pile dataset, early open LLMs", "ai-lab", ["grassroots","gpt-neo","pile"]),
|
|
("HuggingFace", "huggingface.co", "Model hub + transformers library — 1M+ models, datasets, inference endpoints", "ai-platform", ["hub","transformers","mlops"]),
|
|
("Together AI", "together.ai", "OSS LLM inference + fine-tuning — RedPajama, compete on price/quality", "ai-platform", ["inference","rep-pajama","pricing"]),
|
|
("Fireworks AI", "fireworks.ai", "Fast LLM inference — FireOptimizer, multi-LoRA, serverless", "ai-platform", ["fast","multi-lora","serverless"]),
|
|
("Groq", "groq.com", "LPU-accelerated LLM inference — 300+ tokens/sec, deterministic latency", "ai-platform", ["lpu","ultra-fast","deterministic"]),
|
|
("SambaNova", "sambanova.ai", "Dataflow architecture AI — Samba-1 trillion-param MoE, enterprise deployments", "ai-platform", ["dataflow","trillion","enterprise"]),
|
|
("Cerebras", "cerebras.ai", "Wafer-scale engine — largest single chip, fastest inference on large models", "ai-platform", ["wafer-scale","fast","chip"]),
|
|
("Perplexity", "perplexity.ai", "AI-powered search engine — LLM + web search, citations, grounded answers", "ai-product", ["search","citations","grounded"]),
|
|
("Character.AI", "character.ai", "Personality-driven chat — user-created characters, massive scale consumer AI", "ai-product", ["characters","consumer","chat"]),
|
|
("Replicate", "replicate.com", "ML model API marketplace — thousands of models accessible via API, push-to-deploy", "ai-platform", ["marketplace","api","deploy"]),
|
|
("RunPod", "runpod.io", "Spot + dedicated GPU cloud — competitive pricing, serverless endpoints", "ai-platform", ["gpu-cloud","spot","serverless"]),
|
|
("Lambda Labs", "lambdalabs.com", "GPU cloud specializing in deep learning — H100 + A100 at competitive rates", "ai-platform", ["gpu-cloud","h100","ml-focused"]),
|
|
("Vast.ai", "vast.ai", "GPU marketplace — peer-to-peer GPU rental, lowest prices for non-production workloads", "ai-platform", ["marketplace","p2p","cheap"]),
|
|
("CoreWeave", "coreweave.com", "AI cloud infrastructure — GPU clusters at scale, biggest H100 operator", "ai-platform", ["infrastructure","h100","scale"]),
|
|
("Scale AI", "scale.com", "Data labeling + AI training — enterprise/defense data ops, custom models", "ai-platform", ["labeling","enterprise","defense"]),
|
|
("Snorkel AI", "snorkel.ai", "Programmatic data labeling — weak supervision, enterprise LLM customization", "ai-platform", ["weak-supervision","enterprise","labeling"]),
|
|
("Weights & Biases", "wandb.ai", "ML experiment tracking — runs, sweeps, artifacts, reports, de facto MLOps", "mlops-tool", ["tracking","experiments","collaboration"]),
|
|
("MLflow", "mlflow.org", "OSS ML lifecycle — tracking, projects, models, registry, Databricks-backed", "mlops-tool", ["open","lifecycle","databricks"]),
|
|
("Kubeflow", "kubeflow.org", "ML on Kubernetes — pipelines, training, serving, distributed jobs", "mlops-tool", ["k8s","pipelines","distributed"]),
|
|
("ZenML", "zenml.io", "Framework-agnostic MLOps — define pipelines once, run anywhere, open source", "mlops-tool", ["open","pipelines","portable"]),
|
|
("Metaflow", "metaflow.org", "Netflix's ML workflow — human-centric, scales from laptop to cloud", "mlops-tool", ["netflix","human","scale"]),
|
|
("Flyte", "flyte.org", "Kubernetes-native workflow orchestration — ML/data pipelines, strong typing", "mlops-tool", ["k8s","typed","ml-pipelines"]),
|
|
("Prefect", "prefect.io", "Modern workflow orchestration — Python-first, dynamic DAGs, observability", "mlops-tool", ["python","dynamic","observability"]),
|
|
("Airflow", "airflow.apache.org", "Apache Airflow — de facto data pipeline orchestration, batch workflows", "mlops-tool", ["apache","batch","de-facto"]),
|
|
("Dagster", "dagster.io", "Asset-based data orchestrator — typed I/O, testing-first, modern alternative to Airflow", "mlops-tool", ["assets","typed","modern"]),
|
|
("dbt", "getdbt.com", "Data transformation tool — SQL-based, tests, docs, de facto for analytics engineering", "data-tool", ["sql","transformation","analytics"]),
|
|
("DuckDB", "duckdb.org", "In-process analytical DB — SQLite for analytics, blazing fast for local data", "data-tool", ["in-process","analytics","fast"]),
|
|
("Polars", "pola.rs", "Rust DataFrame library — 10x faster than pandas, lazy evaluation, Arrow-backed", "data-tool", ["rust","fast","arrow"]),
|
|
("Daft", "getdaft.io", "Distributed Python DataFrame — complex data (images/tensors), Ray-backed", "data-tool", ["python","distributed","tensors"]),
|
|
("Ibis", "ibis-project.org", "Python DataFrame API over 20+ SQL backends — write once, run on DuckDB/Snowflake/etc", "data-tool", ["portable","sql","python"]),
|
|
("LanceFormat", "lancedb.com/lance", "Columnar format for ML data — faster than parquet, zero-copy, versioned", "data-format", ["columnar","ml","versioned"]),
|
|
("Apache Arrow", "arrow.apache.org", "In-memory columnar format — interoperable, zero-copy, underpins Polars/DuckDB", "data-format", ["columnar","memory","interop"]),
|
|
("Parquet", "parquet.apache.org", "Columnar storage format — de facto for analytics, compressed, predicate pushdown", "data-format", ["columnar","analytics","classic"]),
|
|
("JSONL", "jsonlines.org", "JSON Lines — one JSON object per line, streaming-friendly, de facto for LLM data", "data-format", ["streaming","llm","simple"]),
|
|
("Safetensors", "github.com/huggingface/safetensors", "HuggingFace safe tensor format — zero-copy, secure (no arbitrary code), fast load", "ml-format", ["secure","fast","huggingface"]),
|
|
("GGUF", "huggingface.co/docs/hub/gguf", "llama.cpp quantized model format — standard for quantized OSS LLMs on CPU", "ml-format", ["quantized","cpu","llama-cpp"]),
|
|
("ONNX", "onnx.ai", "Open Neural Network Exchange — interoperable model format across frameworks", "ml-format", ["interop","runtime","standard"]),
|
|
("ExecutorCH", "pytorch.org/executorch", "PyTorch's on-device runtime — mobile/embedded LLM inference, succession of mobile-torch", "ml-runtime", ["pytorch","mobile","embedded"]),
|
|
|
|
# Agent/tool-use libraries
|
|
("LangGraph", "github.com/langchain-ai/langgraph", "LangChain graph-based agents — stateful, cyclic, production-grade agent orchestration", "agent-framework", ["graph","cyclic","production"]),
|
|
("Pregel", "arxiv.org/abs/2307.09793", "Graph-based agent execution model — LangGraph's theoretical foundation", "agent-pattern", ["graph","execution","theory"]),
|
|
("BabyAGI-classic", "github.com/yoheinakajima/babyagi", "Classic autonomous task-loop — historical pioneer of fully autonomous agents", "agent-pattern", ["historical","task-loop","pioneer"]),
|
|
("SmolAgents", "github.com/huggingface/smolagents", "HuggingFace minimalist agent framework — code-first, simple, composable", "agent-framework", ["huggingface","minimal","code-agent"]),
|
|
("FunctionAgents", "docs.cohere.com/docs/tool-use", "Cohere tool-use framework — structured function calling, multi-step planning", "agent-framework", ["cohere","tools","planning"]),
|
|
("Swarm by OpenAI", "github.com/openai/swarm", "OpenAI's educational multi-agent framework — handoffs, routines, minimalist", "agent-framework", ["openai","education","handoffs"]),
|
|
("Agent Protocol", "agentprotocol.ai", "Standard HTTP API for interoperable AI agents — Division of Labor collaboration", "protocol", ["standard","http","interop"]),
|
|
("AutoGPT-Classic", "github.com/Significant-Gravitas/AutoGPT", "Original autonomous GPT — spawned the agent hype cycle of early 2023", "agent-framework", ["historical","autonomous","original"]),
|
|
|
|
# RAG-specific tools
|
|
("Unstructured", "unstructured.io", "Document parsing for RAG — PDF/Word/HTML/PPT/image into structured text", "rag-tool", ["parsing","documents","rag-prep"]),
|
|
("LlamaParse", "cloud.llamaindex.ai/parse", "LlamaIndex's commercial parser — complex PDFs with tables, GenAI-based extraction", "rag-tool", ["llamaindex","pdf","tables"]),
|
|
("Docling", "github.com/DS4SD/docling", "IBM's document conversion — PDF→markdown with structure preservation, open source", "rag-tool", ["ibm","pdf","markdown"]),
|
|
("MarkItDown", "github.com/microsoft/markitdown", "Microsoft's any-to-markdown — DOCX/PPTX/XLSX/PDF/images → markdown for LLM", "rag-tool", ["microsoft","universal","markdown"]),
|
|
("PyMuPDF", "github.com/pymupdf/PyMuPDF", "Fast Python PDF library — extract text/tables/images, most reliable OSS parser", "rag-tool", ["python","pdf","reliable"]),
|
|
("Tesseract", "github.com/tesseract-ocr/tesseract", "Google's OCR engine — 100+ languages, classic OSS OCR", "rag-tool", ["google","ocr","classic"]),
|
|
("Surya OCR", "github.com/VikParuchuri/surya", "Modern OCR + layout analysis — 90+ languages, multilingual strong, open source", "rag-tool", ["modern","multilingual","layout"]),
|
|
("Chunking strategies", "llamahub.ai", "Text chunking approaches — fixed-size, semantic, hierarchical, document-based", "rag-pattern", ["chunking","preprocessing","rag"]),
|
|
("Semantic chunking", "arxiv.org/abs/2402.03216", "Embedding-based segmentation — boundaries where meaning shifts, better than fixed-size", "rag-pattern", ["semantic","embeddings","smart"]),
|
|
("Recursive chunking", "langchain.com/docs/modules/data_connection/document_transformers", "Hierarchical text splitting — paragraph → sentence → word, preserves structure", "rag-pattern", ["hierarchical","structure","preserve"]),
|
|
|
|
# Model families / sizes (additional)
|
|
("Gemini 2.0 Flash", "deepmind.google/technologies/gemini", "Google Gemini 2.0 Flash — fast multimodal, native tool use, real-time API", "llm-model", ["google","fast","multimodal"]),
|
|
("GPT-4o", "openai.com/gpt-4", "OpenAI's omni model — voice+vision+text native, realtime API, industry benchmark", "llm-model", ["openai","omni","realtime"]),
|
|
("o1 / o3", "openai.com/o1", "OpenAI's reasoning models — extended thinking, strong math/code/science", "llm-model", ["openai","reasoning","thinking"]),
|
|
("Claude 3.5 Sonnet", "anthropic.com/claude", "Anthropic's frontier — coding excellence, long context, nuanced reasoning", "llm-model", ["anthropic","coding","frontier"]),
|
|
("Claude 3.5 Haiku", "anthropic.com/claude", "Anthropic's fast tier — cheap, fast, surprisingly capable for most tasks", "llm-model", ["anthropic","fast","cheap"]),
|
|
("Claude 3 Opus", "anthropic.com/claude", "Anthropic's flagship — most intelligent Claude, best for complex analysis", "llm-model", ["anthropic","opus","analysis"]),
|
|
("Gemini Ultra", "gemini.google.com", "Google's most capable — multimodal native, long context", "llm-model", ["google","ultra","multimodal"]),
|
|
("Grok 2", "x.ai/grok", "xAI's LLM — X-integrated, real-time info, image generation via Flux", "llm-model", ["xai","realtime","twitter"]),
|
|
|
|
# Web scraping / search
|
|
("Playwright", "playwright.dev", "Microsoft's modern E2E testing — Chromium/Firefox/WebKit, auto-wait, best for agents", "automation-tool", ["microsoft","e2e","browser"]),
|
|
("Puppeteer", "pptr.dev", "Google's headless Chrome control — scraping, PDFs, automation", "automation-tool", ["google","chrome","headless"]),
|
|
("Selenium", "selenium.dev", "Classic browser automation — oldest E2E tool, WebDriver protocol", "automation-tool", ["classic","webdriver","multi-lang"]),
|
|
("Scrapy", "scrapy.org", "Python web crawling framework — asynchronous, distributed, de facto for scraping", "scraping-tool", ["python","async","de-facto"]),
|
|
("Firecrawl", "firecrawl.dev", "AI-friendly web crawler — JSON extraction, LLM-ready markdown output", "scraping-tool", ["llm-friendly","markdown","ai"]),
|
|
("Jina Reader", "jina.ai/reader", "URL → LLM-ready markdown — free API, removes ads/navigation, readable extracts", "scraping-tool", ["jina","free","markdown"]),
|
|
("Exa Search", "exa.ai", "Neural search API for AI — semantic web search, content extraction, LLM-optimized", "search-tool", ["neural","api","llm-ready"]),
|
|
("Tavily", "tavily.com", "AI search engine API — tailored for agents, grounded answers, RAG-ready", "search-tool", ["agent-focused","grounded","api"]),
|
|
("SerpAPI", "serpapi.com", "Google/Bing/etc search results as JSON — most reliable search scraper API", "search-tool", ["google","reliable","json"]),
|
|
("Brave Search API", "brave.com/search/api", "Privacy-first search API — independent index, transparent, good for AI apps", "search-tool", ["privacy","independent","transparent"]),
|
|
("Searxng", "docs.searxng.org", "OSS metasearch — self-hosted, combines Google/Bing/etc, free", "search-tool", ["oss","metasearch","privacy"]),
|
|
|
|
# Dev tools
|
|
("VSCode", "code.visualstudio.com", "Microsoft's editor — most popular dev IDE, huge extension ecosystem", "dev-tool", ["microsoft","popular","extensions"]),
|
|
("Neovim", "neovim.io", "Modern Vim fork — Lua config, LSP, treesitter, perf focus", "dev-tool", ["vim","lua","terminal"]),
|
|
("Zed", "zed.dev", "High-performance collaborative editor — Rust, GPU-accelerated, multi-buffer", "dev-tool", ["rust","fast","collaborative"]),
|
|
("Git", "git-scm.com", "Distributed VCS — Linus's creation, de facto version control", "dev-tool", ["vcs","linus","de-facto"]),
|
|
("GitHub", "github.com", "Git hosting + collaboration — Microsoft-owned, de facto open source home", "dev-platform", ["microsoft","git","open-source"]),
|
|
("GitLab", "gitlab.com", "Self-hostable DevOps platform — CI/CD + registry + planning, open core", "dev-platform", ["self-host","devops","open-core"]),
|
|
("Gitea", "gitea.com", "Lightweight self-hosted git — Go-based, minimal resources, GitHub-like UI", "dev-platform", ["self-host","go","lightweight"]),
|
|
("Docker", "docker.com", "Containerization — de facto packaging, now pivoting post-Docker-era", "dev-tool", ["containers","packaging","de-facto"]),
|
|
("Kubernetes", "kubernetes.io", "Container orchestration — Google-born, CNCF, de facto cloud-native compute", "dev-platform", ["google","cncf","cloud-native"]),
|
|
("Terraform", "terraform.io", "HashiCorp IaC — declarative infrastructure, huge provider ecosystem", "devops-tool", ["hashicorp","iac","providers"]),
|
|
("Ansible", "ansible.com", "RedHat config management — agentless, YAML playbooks, classic ops tool", "devops-tool", ["redhat","agentless","yaml"]),
|
|
("NixOS / Nix", "nixos.org", "Functional package manager + OS — reproducible builds, declarative", "devops-tool", ["functional","reproducible","declarative"]),
|
|
|
|
# Protocols / Standards
|
|
("REST", "restfulapi.net", "REST architecture style — stateless HTTP APIs, de facto web API standard", "protocol", ["http","classic","de-facto"]),
|
|
("GraphQL", "graphql.org", "Facebook's query language for APIs — clients specify shape, single endpoint", "protocol", ["facebook","query","typed"]),
|
|
("gRPC", "grpc.io", "Google's RPC framework — Protobuf, HTTP/2, streaming, microservices", "protocol", ["google","protobuf","streaming"]),
|
|
("WebSocket", "websockets.spec.whatwg.org", "Bidirectional TCP over HTTP — real-time, used for chat/collaboration", "protocol", ["bidirectional","realtime","http-based"]),
|
|
("Server-Sent Events", "html.spec.whatwg.org/multipage/server-sent-events.html", "SSE — server-push over HTTP, one-way streaming, used for LLM streaming responses", "protocol", ["streaming","llm-use","simple"]),
|
|
("OpenAPI", "openapis.org", "API specification standard — v3.x (Swagger v2 successor), industry-wide", "protocol", ["swagger","spec","standard"]),
|
|
("JSON Schema", "json-schema.org", "Data validation schema for JSON — widely used for LLM structured output", "protocol", ["validation","json","llm-use"]),
|
|
("OAuth 2.0", "oauth.net/2", "Authorization framework — de facto standard for third-party access tokens", "protocol", ["auth","de-facto","tokens"]),
|
|
("OpenID Connect", "openid.net/connect", "Identity layer on OAuth 2.0 — SSO via JWT, SAML successor", "protocol", ["sso","jwt","identity"]),
|
|
("JWT", "jwt.io", "JSON Web Tokens — self-contained signed tokens, used for auth sessions", "protocol", ["tokens","signed","self-contained"]),
|
|
|
|
# Languages / runtimes
|
|
("Python", "python.org", "Dynamic language — #1 for AI/ML/data, massive ecosystem, beginner-friendly", "language", ["ai-ml","ecosystem","popular"]),
|
|
("TypeScript", "typescriptlang.org", "Microsoft's typed JavaScript — de facto for modern web dev, huge adoption", "language", ["microsoft","typed","web"]),
|
|
("Rust", "rust-lang.org", "Systems language — memory-safe without GC, growing for AI infra (tokenizers, candle)", "language", ["systems","safe","growing"]),
|
|
("Go", "go.dev", "Google's language — simple, fast, concurrent, de facto for cloud infrastructure", "language", ["google","cloud","simple"]),
|
|
("JavaScript", "ecma-international.org", "Original web language — ECMAScript standard, universal in browsers", "language", ["web","universal","original"]),
|
|
("SQL", "iso.org/standard/63555.html", "Declarative data query language — decades-old, irreplaceable for analytics", "language", ["data","declarative","classic"]),
|
|
("Lua", "lua.org", "Lightweight embedded scripting — Neovim config, game scripting, fast VM", "language", ["embedded","fast","game"]),
|
|
("Julia", "julialang.org", "Scientific computing language — Python-ease + C-speed, strong numerics", "language", ["scientific","fast","numerical"]),
|
|
("C/C++", "isocpp.org", "Systems programming classics — LLM inference (llama.cpp), game engines, OS", "language", ["systems","fast","classic"]),
|
|
("Zig", "ziglang.org", "Modern C replacement — explicit allocators, comptime, growing for infra", "language", ["c-replacement","modern","systems"]),
|
|
|
|
# Misc popular
|
|
("Hugging Face Spaces", "huggingface.co/spaces", "Free AI app hosting — Gradio/Streamlit/Docker apps, community showcase", "ai-platform", ["hosting","free","demos"]),
|
|
("Kaggle", "kaggle.com", "ML competition + datasets + notebooks — Google-owned, data science community", "ai-platform", ["google","competitions","community"]),
|
|
("Papers With Code", "paperswithcode.com", "ML papers + code + leaderboards — benchmarks, reproducibility focus", "research-tool", ["papers","benchmarks","reproducible"]),
|
|
("arXiv", "arxiv.org", "Preprint server — CS/ML papers published here first, open access", "research-platform", ["preprint","open-access","cs"]),
|
|
("SemanticScholar", "semanticscholar.org", "AI-powered academic search — citation graph, influential papers, AI2", "research-tool", ["allenai","search","citations"]),
|
|
("Google Colab", "colab.research.google.com", "Free Jupyter notebooks with GPUs — ML prototyping, sharing, education", "dev-platform", ["google","jupyter","gpu-free"]),
|
|
("Jupyter", "jupyter.org", "Interactive notebook environment — de facto for data science, Python/R/Julia", "dev-tool", ["notebooks","data-science","de-facto"]),
|
|
("Paperspace Gradient", "paperspace.com", "ML platform — notebooks, GPU workloads, deployment, DigitalOcean-owned", "ai-platform", ["ml","gpu","notebooks"]),
|
|
|
|
# More recent trends
|
|
("RLAIF", "arxiv.org/abs/2212.08073", "Reinforcement Learning from AI Feedback — Anthropic's Constitutional AI uses this", "alignment-method", ["anthropic","ai-feedback","constitutional"]),
|
|
("Constitutional AI", "arxiv.org/abs/2212.08073", "Anthropic's alignment method — self-critique via constitution, scales beyond human feedback", "alignment-method", ["anthropic","self-critique","scalable"]),
|
|
("DORA: Weight Decomposition", "arxiv.org/abs/2402.09353", "DoRA fine-tuning — decompose weights into magnitude + direction for better LoRA", "finetuning-method", ["decomposition","lora-plus"]),
|
|
("Mamba / SSM", "arxiv.org/abs/2312.00752", "State Space Models — linear scaling vs Transformer's quadratic, Mamba architecture", "architecture", ["ssm","linear","alternative"]),
|
|
("Jamba", "arxiv.org/abs/2403.19887", "AI21's Mamba-Transformer hybrid — best of both, efficient long context", "architecture", ["ai21","hybrid","long-context"]),
|
|
("MoE (Mixture of Experts)", "arxiv.org/abs/2401.04088", "Sparse activation — only K of N experts per token, trillion-param models feasible", "architecture", ["sparse","efficient","scaling"]),
|
|
("Speculative Decoding", "arxiv.org/abs/2211.17192", "Small model drafts, large verifies — 2-3x faster inference, same output distribution", "inference-method", ["faster","parallel","same-output"]),
|
|
("Medusa heads", "arxiv.org/abs/2401.10774", "Multiple decoding heads for speculative — no draft model needed, 2x speedup", "inference-method", ["multi-head","fast","no-draft"]),
|
|
("KV Cache", "en.wikipedia.org/wiki/Transformer_(machine_learning_model)", "Transformer attention caching — reuse past K/V tensors, critical for long-gen inference", "inference-optim", ["caching","attention","critical"]),
|
|
("Flash Attention", "arxiv.org/abs/2205.14135", "IO-aware attention — 2-4x faster, memory-efficient, standard for training/inference", "inference-optim", ["io-aware","fast","memory"]),
|
|
("Flash Attention 3", "arxiv.org/abs/2407.08608", "Latest Flash Attention — H100-optimized, 1.5-2x faster than FA2", "inference-optim", ["h100","latest","fast"]),
|
|
("PagedAttention", "arxiv.org/abs/2309.06180", "vLLM's KV cache paging — OS virtual memory for LLMs, enables high throughput batching", "inference-optim", ["vllm","paging","throughput"]),
|
|
("Continuous Batching", "arxiv.org/abs/2307.01237", "Dynamic batching that adds/removes requests mid-batch — 23x throughput vs static", "inference-optim", ["dynamic","throughput","vllm"]),
|
|
("Sliding Window Attention", "arxiv.org/abs/2310.06825", "Mistral's attention pattern — fixed window, long sequences with constant memory", "architecture", ["mistral","window","memory"]),
|
|
("RoPE (Rotary Position Embedding)", "arxiv.org/abs/2104.09864", "Rotary positional encoding — Llama/Qwen/Mistral standard, extensible context", "architecture", ["positional","rotary","extensible"]),
|
|
("YaRN", "arxiv.org/abs/2309.00071", "RoPE context extension — interpolation + attention scaling, 128k+ contexts", "architecture", ["rope","context","extension"]),
|
|
|
|
# AI safety
|
|
("Red teaming", "arxiv.org/abs/2209.07858", "Adversarial probing of AI systems — find jailbreaks, harmful outputs, failure modes", "safety-practice", ["adversarial","safety","probing"]),
|
|
("Jailbreak", "wikipedia.org/wiki/Jailbreaking_(language_models)", "Bypass AI safety — prompt engineering to elicit restricted behavior", "safety-concept", ["attack","prompt","safety"]),
|
|
("Prompt injection", "owasp.org/www-project-top-10-for-large-language-model-applications", "LLM equivalent of SQL injection — malicious instructions in data fed to LLM", "safety-concept", ["attack","security","owasp"]),
|
|
("Data poisoning", "arxiv.org/abs/2302.10149", "Malicious training data to corrupt model — especially dangerous for fine-tuning", "safety-concept", ["attack","training","corrupt"]),
|
|
("Model extraction", "arxiv.org/abs/1609.02943", "Steal model via queries — reconstruct parameters or distill behavior", "safety-concept", ["attack","stealing","queries"]),
|
|
|
|
# Final misc batch
|
|
("Tiktoken", "github.com/openai/tiktoken", "OpenAI's tokenizer library — BPE tokenization for GPT models, de facto for token counting", "llm-tool", ["openai","tokenizer","bpe"]),
|
|
("SentencePiece", "github.com/google/sentencepiece", "Google's tokenizer — Llama/Mistral/Qwen use this, BPE/Unigram support", "llm-tool", ["google","tokenizer","bpe"]),
|
|
("Tokenizers (HF)", "github.com/huggingface/tokenizers", "HuggingFace Rust tokenizers — 1M+ tok/sec, all major algorithms", "llm-tool", ["huggingface","rust","fast"]),
|
|
]
|
|
|
|
def deterministic_id(idx, variation_idx):
    """Stable point ID for (pattern idx, variation idx) — re-runs upsert the same IDs."""
    seed = f"weval-oss-b2-p{idx}-v{variation_idx}"
    digest = hashlib.sha256(seed.encode()).digest()
    # 48 bits of the hash keeps IDs comfortably inside Qdrant's unsigned-64 range,
    # and ID_OFFSET separates this batch's ID space from batch 1's.
    return ID_OFFSET + int.from_bytes(digest[:6], 'big')
|
|
|
|
def make_variations(title, source, desc, category, tags):
    """Build up to 10 textual phrasings of one pattern entry for embedding.

    Variations mix the title with its description, category, tags, and source
    so each pattern lands in several neighborhoods of the embedding space.
    Degenerate strings (empty or <= 5 chars) are dropped.
    """
    joined_tags = " ".join(tags)
    top_tags = ", ".join(tags[:3])
    candidates = (
        title,
        f"{title}: {desc}",
        f"{title} — {category}",
        f"{title} [{joined_tags}]",
        f"What is {title}? {desc}",
        f"{category}: {title}",
        f"{title} from {source}",
        f"{title} description: {desc[:100]}",
        f"OSS pattern: {title} — tags: {top_tags}",
        f"{title} ({source}): {desc[:80]}",
    )
    kept = []
    for text in candidates:
        # Skip variations too short to embed meaningfully.
        if text and len(text) > 5:
            kept.append(text)
    return kept
|
|
|
|
def qdrant_upsert_batch(points):
    """PUT one batch of points into the Qdrant collection.

    Uses ?wait=true so the call blocks until the points are persisted;
    returns Qdrant's decoded JSON response.
    """
    body = json.dumps({"points": points}).encode()
    request = urllib.request.Request(
        f"{QDRANT_URL}/collections/{COLLECTION}/points?wait=true",
        data=body,
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    with urllib.request.urlopen(request, timeout=30) as resp:
        return json.loads(resp.read())
|
|
|
|
def main():
    """Embed every variation of every EXTRA pattern and upsert into Qdrant.

    Returns a process exit code: 0 if at least one batch was ingested,
    1 if every upsert batch failed.
    """
    # Lazy import: sentence_transformers is heavy and only needed at run time.
    from sentence_transformers import SentenceTransformer

    print(f"Loaded {len(EXTRA)} additional OSS patterns")
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    # Flatten every pattern into (text, deterministic id, payload) triples.
    all_texts, all_ids, all_payloads = [], [], []
    for p_idx, (title, source, desc, category, tags) in enumerate(EXTRA):
        for v_idx, text in enumerate(make_variations(title, source, desc, category, tags)):
            all_texts.append(text)
            all_ids.append(deterministic_id(p_idx, v_idx))
            all_payloads.append({
                "title": title, "source": source, "description": desc,
                "category": category, "tags": tags,
                "pattern_idx": p_idx, "variation_idx": v_idx,
                # Stored text is truncated; the full text only matters for embedding.
                "text": text[:500], "ingested_at": datetime.now().isoformat(),
                "ingest_source": "ingest-oss-skills-batch2",
            })

    print(f"Total variations: {len(all_texts)}")
    t0 = time.time()
    embeddings = model.encode(all_texts, batch_size=64, show_progress_bar=False, convert_to_numpy=True)
    print(f"Embeddings done in {round(time.time()-t0,1)}s")

    total = 0
    for i in range(0, len(all_texts), BATCH_SIZE):
        pts = [{"id": all_ids[j], "vector": embeddings[j].tolist(), "payload": all_payloads[j]}
               for j in range(i, min(i+BATCH_SIZE, len(all_texts)))]
        try:
            r = qdrant_upsert_batch(pts)
            total += len(pts)
            print(f" batch {i//BATCH_SIZE+1}: upserted {len(pts)} (total={total}) {r.get('result',{}).get('status')}")
        except Exception as e:
            # Best-effort: one bad batch shouldn't abort the whole run.
            print(f" batch FAIL: {e}")

    # Final collection stats are informational only — a failed status call
    # must not crash the script (and thereby mask a successful ingest).
    try:
        req = urllib.request.Request(f"{QDRANT_URL}/collections/{COLLECTION}", method="GET")
        with urllib.request.urlopen(req, timeout=10) as r:
            d = json.loads(r.read())
        print(f"\n═══ DONE · Ingested {total} · weval_skills now {d['result']['points_count']} points ═══")
    except Exception as e:
        print(f"\n═══ DONE · Ingested {total} · (collection stats unavailable: {e}) ═══")

    # Fix: previously returned 0 unconditionally, so a run where every batch
    # failed still exited "success". Signal total failure to the caller.
    return 0 if total > 0 else 1
|
|
|
|
if __name__ == "__main__":
    # Script entry point: propagate main()'s return value as the process exit code.
    sys.exit(main())
|