179 lines
8.5 KiB
Python
179 lines
8.5 KiB
Python
import asyncio
import os
import shutil
import time

from playwright.async_api import async_playwright
|
|
|
|
# Notebook cell source that run() types into the Kaggle editor.
# Lines starting with "!" are notebook shell commands; the rest is Python that
# fine-tunes a 4-bit Qwen2.5-7B-Instruct model with LoRA via unsloth, exports
# it as GGUF (q4_k_m) and pushes it to the Hugging Face Hub using the HF_TOKEN
# environment variable of the notebook session.
# NOTE: indentation inside this literal is significant -- it is the body of
# `format_chat` and of the `for` loop in the notebook code.
TRAINING_CODE = """
!pip install -q unsloth datasets trl peft accelerate bitsandbytes
!wget -q 'https://huggingface.co/datasets/yace222/weval-brain-dataset/resolve/main/train_chatml.jsonl' -O /kaggle/working/dataset.jsonl
!wc -l /kaggle/working/dataset.jsonl

from unsloth import FastLanguageModel
import torch
model, tokenizer = FastLanguageModel.from_pretrained(model_name='unsloth/Qwen2.5-7B-Instruct-bnb-4bit', max_seq_length=2048, load_in_4bit=True)
model = FastLanguageModel.get_peft_model(model, r=16, target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'], lora_alpha=16, lora_dropout=0, bias='none', use_gradient_checkpointing='unsloth')
print('Model loaded!')

from datasets import load_dataset
dataset = load_dataset('json', data_files='/kaggle/working/dataset.jsonl', split='train')
def format_chat(example):
    text = tokenizer.apply_chat_template(example['messages'], tokenize=False, add_generation_prompt=False)
    return {'text': text}
dataset = dataset.map(format_chat)
print(f'Formatted: {len(dataset)} examples')

from trl import SFTTrainer
from transformers import TrainingArguments
trainer = SFTTrainer(model=model, tokenizer=tokenizer, train_dataset=dataset, dataset_text_field='text', max_seq_length=2048, args=TrainingArguments(per_device_train_batch_size=2, gradient_accumulation_steps=4, warmup_steps=5, num_train_epochs=3, learning_rate=2e-4, fp16=not torch.cuda.is_bf16_supported(), bf16=torch.cuda.is_bf16_supported(), logging_steps=1, output_dir='/kaggle/working/outputs', optim='adamw_8bit', seed=42))
stats = trainer.train()
print(f'Done! Loss: {stats.training_loss:.4f}')

model.save_pretrained_gguf('/kaggle/working/weval-brain-gguf', tokenizer, quantization_method='q4_k_m')
print('GGUF saved!')
import os
for f in os.listdir('/kaggle/working/weval-brain-gguf'):
    sz = os.path.getsize(os.path.join('/kaggle/working/weval-brain-gguf', f)) // 1024 // 1024
    print(f'{f}: {sz}MB')

# Push to HuggingFace
model.push_to_hub_gguf('yace222/weval-brain-v3-gguf', tokenizer, quantization_method='q4_k_m', token=os.environ.get('HF_TOKEN',''))
print('Pushed to HuggingFace!')
"""
|
|
|
|
# Environment variables that supply the Google account used to sign in to
# Kaggle. Credentials are deliberately NOT stored in this file.
_EMAIL_ENV = "KAGGLE_GOOGLE_EMAIL"
_PASSWORD_ENV = "KAGGLE_GOOGLE_PASSWORD"

# Screenshots copied from /tmp to the web directory at the end of a run.
_PUBLISHED_SCREENSHOTS = (
    "kaggle-login.png",
    "kaggle-logged.png",
    "kaggle-settings.png",
    "kaggle-notebook.png",
    "kaggle-code.png",
    "kaggle-running.png",
)


def _stamp():
    """Return the current local time as HH:MM:SS for progress messages."""
    return time.strftime("%H:%M:%S")


async def _click_google_sign_in(page):
    """Click the Google sign-in control on the Kaggle login page (best effort)."""
    try:
        # A selector starting with `text=` treats everything after it as the
        # text to match, so alternatives cannot be comma-separated behind it.
        # The CSS `:text()` / `:has-text()` pseudo-classes can be listed.
        google_btn = page.locator(
            ":text('Sign in with Google'), "
            "button:has-text('Google'), "
            "[data-testid*='google']"
        )
        if await google_btn.count() > 0:
            await google_btn.first.click()
            await page.wait_for_timeout(5000)
            print("Google sign-in clicked")
            return

        # Fallback: click the first button/link whose label mentions Google
        # or "sign in".
        for btn in await page.query_selector_all("button, a"):
            label = (await btn.inner_text()).strip()
            lowered = label.lower()
            if "google" in lowered or "sign in" in lowered:
                print(f"Clicking: {label}")
                await btn.click()
                await page.wait_for_timeout(5000)
                break
    except Exception as e:
        # Best effort: the caller inspects the resulting URL afterwards.
        print(f"Sign-in click: {e}")


async def _submit_google_credentials(page):
    """Fill in and submit the Google e-mail/password forms (best effort)."""
    email = os.environ.get(_EMAIL_ENV)
    password = os.environ.get(_PASSWORD_ENV)
    if not email or not password:
        print(f"Google login skipped: set {_EMAIL_ENV} and {_PASSWORD_ENV}")
        return

    try:
        await page.fill('input[type="email"]', email)
        await page.wait_for_timeout(500)
        await page.click('#identifierNext')
        await page.wait_for_timeout(4000)

        pwd = page.locator('input[type="password"]')
        if await pwd.count() > 0:
            await pwd.fill(password)
            await page.wait_for_timeout(500)
            await page.click('#passwordNext')
            await page.wait_for_timeout(5000)
            print("Google login submitted")
    except Exception as e:
        print(f"Google login: {e}")


async def _create_api_token(page):
    """Open the Kaggle settings page and click "Create New Token" if present."""
    print(f"\n[{_stamp()}] Step 2: Getting API key...")
    await page.goto("https://www.kaggle.com/settings", timeout=30000)
    await page.wait_for_timeout(5000)
    await page.screenshot(path="/tmp/kaggle-settings.png")

    try:
        create_token = page.locator(
            ":text('Create New Token'), "
            "button:has-text('Create New Token'), "
            ":text('Create New API Token')"
        )
        if await create_token.count() > 0:
            await create_token.first.click()
            await page.wait_for_timeout(3000)
            # NOTE(review): nothing here captures a file download or reads the
            # token value; only the click and a screenshot happen.
            print("API Token created!")
            await page.screenshot(path="/tmp/kaggle-token.png")
    except Exception as e:
        print(f"Token: {e}")


async def _select_gpu(page):
    """Open the notebook settings panel and click a GPU option (best effort)."""
    try:
        settings_btn = page.locator(
            ":text('Settings'), "
            "[aria-label*='settings'], "
            "button:has-text('Settings')"
        )
        if await settings_btn.count() > 0:
            await settings_btn.first.click()
            await page.wait_for_timeout(2000)

        gpu_option = page.locator(":text('GPU'), :text('T4'), :text('P100')")
        if await gpu_option.count() > 0:
            await gpu_option.first.click()
            print("GPU selected!")
    except Exception as e:
        print(f"GPU settings: {e}")


async def _create_and_run_notebook(page):
    """Open a new Kaggle notebook, type TRAINING_CODE into it and start it."""
    print(f"\n[{_stamp()}] Step 3: Creating notebook...")
    await page.goto("https://www.kaggle.com/code/new", timeout=30000)
    await page.wait_for_timeout(8000)
    await page.screenshot(path="/tmp/kaggle-notebook.png")

    title = await page.title()
    print(f"Notebook: {title} | {page.url[:60]}")

    # Heuristic only: any of these substrings in the page HTML counts.
    # NOTE(review): "code" is a very common substring, so this check is
    # likely true for most pages -- consider waiting for a real editor element.
    html = (await page.content()).lower()
    has_editor = any(marker in html for marker in ("editor", "cell", "code"))
    print(f"Editor loaded: {has_editor}")
    if not has_editor:
        return

    print("Pasting training code...")
    # Keystrokes go to whatever element currently has focus.
    await page.keyboard.type(TRAINING_CODE, delay=1)
    await page.wait_for_timeout(2000)
    await page.screenshot(path="/tmp/kaggle-code.png")
    print("Code pasted!")

    await _select_gpu(page)

    # Run all - Kaggle uses Shift+Enter for each cell or Run All button
    await page.keyboard.press("Shift+Enter")
    await page.wait_for_timeout(5000)
    await page.screenshot(path="/tmp/kaggle-running.png")
    print("EXECUTION STARTED!")


def _publish_screenshots():
    """Copy the run's screenshots from /tmp into the web-served directory."""
    for name in _PUBLISHED_SCREENSHOTS:
        try:
            shutil.copy(f"/tmp/{name}", f"/var/www/html/api/notebooks/{name}")
        except OSError as e:
            # A screenshot is missing whenever its step was skipped; keep going.
            print(f"Screenshot copy skipped ({name}): {e}")


async def run():
    """Drive a headless Chromium session that logs in to Kaggle and starts training.

    Steps, each recorded as a screenshot under /tmp:
      1. Open the Kaggle login page and try to sign in through Google, using
         the account given by the KAGGLE_GOOGLE_EMAIL / KAGGLE_GOOGLE_PASSWORD
         environment variables.
      2. If the browser ends up on kaggle.com, open the settings page and
         click "Create New Token".
      3. Open a new notebook, type TRAINING_CODE into it, try to select a GPU
         and press Shift+Enter.
    Finally a fixed set of screenshots is copied to
    /var/www/html/api/notebooks/.

    The browser uses the persistent profile directory /tmp/chrome-kaggle.
    Clicking and form filling are best effort (errors are printed); errors
    from page navigation propagate to the caller. The browser context is
    closed even when a step raises.
    """
    async with async_playwright() as p:
        ctx = await p.chromium.launch_persistent_context(
            "/tmp/chrome-kaggle", headless=True,
            args=["--no-sandbox", "--disable-blink-features=AutomationControlled"],
            viewport={"width": 1920, "height": 1080},
            user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        )
        try:
            page = ctx.pages[0] if ctx.pages else await ctx.new_page()
            # Make navigator.webdriver read as undefined on every page load.
            await page.add_init_script("Object.defineProperty(navigator,'webdriver',{get:()=>undefined})")

            # Step 1: Login to Kaggle
            print(f"[{_stamp()}] Step 1: Kaggle login...")
            await page.goto("https://www.kaggle.com/account/login", timeout=30000)
            await page.wait_for_timeout(5000)

            title = await page.title()
            print(f"Page: {title} | URL: {page.url[:60]}")
            await page.screenshot(path="/tmp/kaggle-login.png")

            await _click_google_sign_in(page)

            await page.screenshot(path="/tmp/kaggle-after-click.png")
            after_click_url = page.url
            print(f"After click: {after_click_url[:80]}")

            # If on Google login
            if "accounts.google" in after_click_url:
                await _submit_google_credentials(page)

            await page.screenshot(path="/tmp/kaggle-logged.png")
            after_login_url = page.url
            print(f"After login: {after_login_url[:80]}")

            # NOTE(review): the Kaggle login URL itself contains "kaggle.com",
            # so this does not prove that the login succeeded.
            if "kaggle.com" in after_login_url:
                await _create_api_token(page)
                await _create_and_run_notebook(page)

            # Copy all screenshots
            _publish_screenshots()
        finally:
            await ctx.close()
        print("\nDONE")
|
|
|
|
# Start the automation only when this file is executed as a script, so that
# importing the module (for example to reuse TRAINING_CODE) does not launch a
# browser session.
if __name__ == "__main__":
    asyncio.run(run())
|