fix(monitor): remove non-existent worker columns from job_run_logs query

The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-03 18:45:05 -07:00
parent 54f40d26bb
commit 66e07b2009
466 changed files with 84988 additions and 9226 deletions

View File

@@ -0,0 +1,214 @@
import { chromium as playwright } from 'playwright-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import fs from 'fs/promises';
import path from 'path';
// Register the stealth plugin on the playwright-extra Chromium launcher.
// This runs at module load, i.e. before main() calls `playwright.launch`.
// NOTE(review): presumably intended to reduce automation fingerprinting — confirm
// which evasions actually apply under Playwright.
playwright.use(StealthPlugin());
/**
 * One purchasable option of a product as scraped from the page.
 */
type ProductVariant = {
  /** Display text of the option; entries with an empty label are discarded by the extractor. */
  label: string;
  /** Price parsed from the option's text, when a number could be found. */
  price?: number;
  /** Inventory note; the extraction code in this file never populates it. */
  inventory?: string;
};
/**
 * Structured result of scraping a single product page.
 *
 * Only `name`, `images` and `productUrl` are always present; every other field
 * is filled in only when a matching element was found on the page.
 */
type ProductData = {
  /** Product title; the extractor falls back to an empty string when none is found. */
  name: string;
  /** Brand text, if a brand element matched. */
  brand?: string;
  /** Price parsed from the first matching price element. */
  price?: number;
  /** Description text, if a description element matched. */
  description?: string;
  /** First potency text that mentions "thc" (case-insensitive). */
  thc?: string;
  /** First potency text that mentions "cbd" (case-insensitive). */
  cbd?: string;
  /** Breadcrumb / category text, if present. */
  category?: string;
  /** Variant options found on the page. */
  variants?: ProductVariant[];
  /** De-duplicated image URLs discovered on the page. */
  images: string[];
  /** URL of the page at extraction time (`window.location.href`). */
  productUrl: string;
};
// The single product page this script scrapes.
const PRODUCT_URL =
'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted/product/mfused-loud-liquid-diamonds-aio-stoopid-gas';
// All output is written below <current working directory>/scrape-output/dutchie-product.
const OUTPUT_DIR = path.join(process.cwd(), 'scrape-output', 'dutchie-product');
// Downloaded image files are stored here.
const IMAGE_DIR = path.join(OUTPUT_DIR, 'images');
// The extracted product (plus local image paths) is serialized to this JSON file.
const JSON_PATH = path.join(OUTPUT_DIR, 'product.json');
/**
 * Creates the image output directory if it does not exist yet.
 *
 * IMAGE_DIR is nested inside OUTPUT_DIR, so the recursive mkdir creates the
 * whole output tree in one call and is a no-op when it already exists.
 */
async function ensureDirs(): Promise<void> {
  const mkdirOptions = { recursive: true };
  await fs.mkdir(IMAGE_DIR, mkdirOptions);
}
/**
 * Minimal page surface needed to poll for a Cloudflare interstitial.
 * Any object exposing these three async methods (such as the page object
 * passed in by `main`) can be used.
 */
type ChallengeProbePage = {
  title(): Promise<string>;
  content(): Promise<string>;
  waitForTimeout(timeout: number): Promise<void>;
};

/**
 * Polls the page until it no longer looks like a Cloudflare challenge.
 *
 * A page counts as a challenge when its title contains "Attention Required" or
 * "Just a moment", or its HTML contains "challenge-platform" or "cf-challenge".
 *
 * The page is always inspected at least once, and once more when the time
 * budget runs out, so a challenge that clears during the final sleep is still
 * detected and a zero budget still yields a meaningful answer.
 *
 * @param page Page to inspect.
 * @param maxWaitMs Maximum total time to keep polling, in milliseconds.
 * @param pollIntervalMs Pause between two inspections, in milliseconds.
 * @returns `true` once no challenge marker is present, `false` if a marker is
 *          still present when the time budget is exhausted.
 */
async function waitForCloudflare(
  page: ChallengeProbePage,
  maxWaitMs = 60000,
  pollIntervalMs = 2000,
): Promise<boolean> {
  const deadline = Date.now() + maxWaitMs;
  for (;;) {
    // A failed read (e.g. mid-navigation) is treated as an empty string, which
    // contains no challenge marker.
    const title = await page.title().catch(() => '');
    const content = await page.content().catch(() => '');
    const challenge =
      title.includes('Attention Required') ||
      title.includes('Just a moment') ||
      content.includes('challenge-platform') ||
      content.includes('cf-challenge');
    if (!challenge) return true;

    const remainingMs = deadline - Date.now();
    if (remainingMs <= 0) return false;
    // Never sleep past the deadline; the next iteration performs the final check.
    await page.waitForTimeout(Math.max(0, Math.min(pollIntervalMs, remainingMs)));
  }
}
/**
 * Extracts product details from the currently loaded product page.
 *
 * The whole callback is handed to `page.evaluate`, so it must be
 * self-contained: it uses only the DOM globals and the helpers declared
 * inside it, never variables from the enclosing scope.
 *
 * @param page Object exposing `evaluate(fn)`; kept as `any` because the
 *             page type is not imported in this file.
 * @returns The scraped product. Fields whose element is missing are
 *          `undefined`; `name` falls back to an empty string.
 */
async function extractProduct(page: any): Promise<ProductData> {
  return page.evaluate(() => {
    /** Trimmed text of the first selector that matches a non-empty element. */
    const pickText = (selectorList: string[]): string | undefined => {
      for (const sel of selectorList) {
        const el = document.querySelector(sel) as HTMLElement | null;
        const txt = el?.innerText?.trim();
        if (txt) return txt;
      }
      return undefined;
    };

    /** Trimmed, non-empty texts of every element matching the selector. */
    const pickAllTexts = (selector: string): string[] =>
      Array.from(document.querySelectorAll(selector))
        .map(el => (el as HTMLElement).innerText?.trim())
        .filter((txt): txt is string => Boolean(txt));

    /**
     * Parses a price out of free text.
     *
     * A `$`-prefixed amount wins over any earlier bare number, so text such as
     * "1g$45.00" yields 45 rather than the weight. When no `$` is present the
     * first number in the text is used. Thousands separators ("$1,045.50")
     * are accepted.
     */
    const parsePrice = (text?: string | null): number | undefined => {
      if (!text) return undefined;
      const match =
        text.match(/\$\s*(\d{1,3}(?:,\d{3})+(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)/) ??
        text.match(/(\d{1,3}(?:,\d{3})+(?:\.\d{1,2})?|\d+(?:\.\d{1,2})?)/);
      const amount = match?.[1];
      return amount ? parseFloat(amount.replace(/,/g, '')) : undefined;
    };

    const name =
      pickText(['[data-testid="product-name"]', 'h1', '[class*="ProductTitle"]']) || '';
    const brand = pickText(['[data-testid="product-brand"]', '[class*="Brand"]']);
    const priceText =
      pickText([
        '[data-testid="product-price"]',
        '[data-testid*="price"]',
        '[class*="Price"]',
      ]) || '';
    const description = pickText([
      '[data-testid="product-description"]',
      'article',
      '[class*="Description"]',
    ]);

    // Potency elements are collected once and then split by the cannabinoid they mention.
    const potencyTexts = pickAllTexts(
      '[data-testid*="thc"], [data-testid*="cbd"], [class*="Potency"]',
    );
    const thc = potencyTexts.find(t => t.toLowerCase().includes('thc'));
    const cbd = potencyTexts.find(t => t.toLowerCase().includes('cbd'));

    const category = pickText([
      '[data-testid="breadcrumb"]',
      '[class*="Breadcrumb"]',
      '[data-testid*="category"]',
    ]);

    const variantEls = Array.from(
      document.querySelectorAll(
        '[data-testid*="variant"], [data-testid*="option"], [class*="Variant"]',
      ),
    );
    const variants = variantEls
      .map(el => {
        // Prefer the first nested span/div as the label; fall back to the element's own text.
        const label =
          (el.querySelector('span,div') as HTMLElement | null)?.innerText?.trim() ||
          el.textContent?.trim() ||
          '';
        const price = parsePrice(el.textContent || undefined);
        return { label, price };
      })
      .filter(v => v.label);

    // `img[src*="images.dutchie.com"]` already covers sources that include the
    // "https://" prefix, so no separate selector is needed for those.
    const imageUrls = Array.from(
      document.querySelectorAll(
        'img[src*="images.dutchie.com"], source[srcset*="images.dutchie.com"]',
      ),
    )
      .map(el => {
        if (el instanceof HTMLImageElement) return el.src;
        // For <source>, take the URL of the first srcset candidate.
        const srcset = (el as HTMLSourceElement).srcset || '';
        return srcset.split(',')[0]?.trim().split(' ')[0];
      })
      .filter((u): u is string => !!u);

    return {
      name,
      brand,
      price: parsePrice(priceText),
      description,
      thc,
      cbd,
      category,
      variants,
      images: Array.from(new Set(imageUrls)),
      productUrl: window.location.href,
    };
  });
}
/**
 * Builds a file name from an arbitrary base string and an extension.
 *
 * The base is lower-cased, every run of characters other than a-z / 0-9 is
 * collapsed into a single dash, and one leading and one trailing dash are
 * removed. If nothing remains, "image" is used as the stem.
 *
 * @param base Text to derive the stem from.
 * @param ext Extension without the leading dot; appended unchanged.
 * @returns `<stem>.<ext>`
 */
function safeFileName(base: string, ext: string): string {
  const slug = base
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '');
  const stem = slug.length > 0 ? slug : 'image';
  return stem + '.' + ext;
}
/**
 * Downloads each image URL into IMAGE_DIR.
 *
 * Downloads run one after another. A failed download is logged with
 * `console.warn` and skipped, so one bad URL does not abort the rest.
 *
 * File naming:
 * - the stem is taken from the last segment of the URL *path* (query string
 *   and fragment are ignored), up to its first dot;
 * - the extension comes from the URL path, else from the response
 *   Content-Type, else "bin";
 * - if two URLs map to the same file name, later ones get a "-2", "-3", …
 *   suffix instead of overwriting the earlier file.
 *
 * @param urls Image URLs to fetch.
 * @returns Paths of the files that were written, in download order.
 */
async function downloadImages(urls: string[]): Promise<string[]> {
  // Content-Type fragments mapped to the extension to use when the URL has none.
  const extByMimeFragment: ReadonlyArray<readonly [string, string]> = [
    ['png', 'png'],
    ['jpeg', 'jpg'],
    ['jpg', 'jpg'],
    ['webp', 'webp'],
    ['gif', 'gif'],
    ['avif', 'avif'],
  ];

  const saved: string[] = [];
  const usedNames = new Set<string>();

  for (const url of urls) {
    try {
      const res = await fetch(url);
      if (!res.ok) throw new Error(`HTTP ${res.status}`);
      const buf = Buffer.from(await res.arrayBuffer());

      const contentType = (res.headers.get('content-type') ?? '').toLowerCase();
      // Work on the parsed pathname so "?w=200" or "#frag" never leak into the name.
      const { pathname } = new URL(url);
      const urlExt = path.extname(pathname).replace('.', '');
      const mimeExt = extByMimeFragment.find(([fragment]) => contentType.includes(fragment))?.[1];
      const ext = urlExt || mimeExt || 'bin';

      const base = path.posix.basename(pathname).split('.')[0] || 'image';
      const firstChoice = safeFileName(base, ext);
      // Recover the sanitized stem so numbered alternatives keep the same shape.
      const stem = firstChoice.slice(0, firstChoice.length - ext.length - 1);
      let fileName = firstChoice;
      for (let n = 2; usedNames.has(fileName); n += 1) {
        fileName = `${stem}-${n}.${ext}`;
      }
      usedNames.add(fileName);

      const filePath = path.join(IMAGE_DIR, fileName);
      await fs.writeFile(filePath, buf);
      saved.push(filePath);
    } catch (err) {
      console.warn(`Failed to download image ${url}:`, err);
    }
  }
  return saved;
}
/**
 * Entry point: scrapes PRODUCT_URL and writes the result to disk.
 *
 * Steps: create the output directories, launch headless Chromium, open the
 * product page, wait for any Cloudflare challenge to clear, extract the
 * product data, download its images, and write everything to JSON_PATH.
 *
 * Any failure after the browser has launched is logged and turns the process
 * exit code into 1; the browser is closed in every case.
 */
async function main(): Promise<void> {
  await ensureDirs();
  const browser = await playwright.launch({
    headless: true,
    // NOTE(review): sandbox flags presumably exist for container/CI runs — confirm.
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled',
    ],
  });
  try {
    // Context and page are created inside the try so a failure here still
    // reaches the finally block and closes the browser.
    const context = await browser.newContext({
      viewport: { width: 1280, height: 900 },
      userAgent:
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    });
    const page = await context.newPage();

    console.log(`Navigating to product page...`);
    await page.goto(PRODUCT_URL, { waitUntil: 'domcontentloaded', timeout: 90000 });

    const cfOk = await waitForCloudflare(page, 60000);
    if (!cfOk) {
      throw new Error('Cloudflare challenge not passed in time');
    }

    // Best effort: a missing product marker is not fatal, extraction simply
    // works with whatever has rendered. The timeout is logged, not swallowed.
    await page
      .waitForSelector('[data-testid*="product"]', { timeout: 60000 })
      .catch((err: unknown) => {
        console.warn('Product selector did not appear; continuing with current page state:', err);
      });
    // Fixed extra pause before reading the DOM.
    await page.waitForTimeout(2000);

    const product = await extractProduct(page);
    console.log('Extracted product:');
    console.log(product);

    const imagePaths = await downloadImages(product.images);
    const finalProduct = { ...product, imagePaths };
    await fs.writeFile(JSON_PATH, JSON.stringify(finalProduct, null, 2));
    console.log(`Saved product JSON to ${JSON_PATH}`);
    if (imagePaths.length) {
      console.log(`Saved ${imagePaths.length} images to ${IMAGE_DIR}`);
    }
  } catch (err) {
    console.error('Failed to scrape product:', err);
    process.exitCode = 1;
  } finally {
    await browser.close();
  }
}

// Failures before the try block (directory creation, browser launch) and in
// the finally block (browser.close) reject main(); report them here.
main().catch((err: unknown) => {
  console.error('Failed to scrape product:', err);
  process.exitCode = 1;
});