fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
214
backend/new-scrapers/fetch-dutchie-product.ts
Normal file
214
backend/new-scrapers/fetch-dutchie-product.ts
Normal file
@@ -0,0 +1,214 @@
|
||||
import { chromium as playwright } from 'playwright-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
|
||||
// Register the stealth plugin on the chromium launcher at module load, i.e.
// before main() launches any browser.
const stealthPlugin = StealthPlugin();
playwright.use(stealthPlugin);
|
||||
|
||||
/** One selectable option of a product, as scraped from a variant/option element. */
interface ProductVariant {
  /** Display text of the variant element. */
  label: string;
  /** Price parsed from the variant element's text, when one is found. */
  price?: number;
  /** Declared for inventory information; nothing in this file assigns it. */
  inventory?: string;
}
|
||||
|
||||
/** Everything the scraper extracts from a single product page. */
interface ProductData {
  /** Product title; an empty string when no title element matched. */
  name: string;
  /** Brand text, when a brand element matched. */
  brand?: string;
  /** Price parsed from the page's price element, when one is found. */
  price?: number;
  /** Description text, when a description element matched. */
  description?: string;
  /** Raw text of the first potency element mentioning "thc". */
  thc?: string;
  /** Raw text of the first potency element mentioning "cbd". */
  cbd?: string;
  /** Breadcrumb/category text, when such an element matched. */
  category?: string;
  /** Variant/option entries found on the page. */
  variants?: ProductVariant[];
  /** De-duplicated image URLs found on the page. */
  images: string[];
  /** URL of the page the data was read from. */
  productUrl: string;
}
|
||||
|
||||
/** Product page scraped when no URL is supplied on the command line. */
const DEFAULT_PRODUCT_URL =
  'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted/product/mfused-loud-liquid-diamonds-aio-stoopid-gas';

/**
 * Page to scrape. The first command-line argument overrides the default, so
 * the script can be pointed at another product without editing the source.
 * `||` (not `??`) is deliberate: a blank argument must fall back to the default.
 */
const PRODUCT_URL = process.argv[2]?.trim() || DEFAULT_PRODUCT_URL;

/** Root folder for everything this script writes, relative to the working directory. */
const OUTPUT_DIR = path.join(process.cwd(), 'scrape-output', 'dutchie-product');

/** Folder that receives the downloaded product images. */
const IMAGE_DIR = path.join(OUTPUT_DIR, 'images');

/** File that receives the extracted product as JSON. */
const JSON_PATH = path.join(OUTPUT_DIR, 'product.json');
|
||||
|
||||
/**
 * Creates the image output directory together with any missing parent
 * directories, so the JSON and image writes later in the run have somewhere
 * to go.
 */
async function ensureDirs(): Promise<void> {
  const mkdirOptions = { recursive: true };
  await fs.mkdir(IMAGE_DIR, mkdirOptions);
}
|
||||
|
||||
/** The slice of a Playwright page that waitForCloudflare actually uses. */
type ChallengePage = {
  title(): Promise<string>;
  content(): Promise<string>;
  waitForTimeout(timeout: number): Promise<void>;
};

/**
 * Polls the page until it no longer looks like a Cloudflare challenge.
 *
 * A page counts as a challenge when its title contains "Attention Required"
 * or "Just a moment", or its HTML contains "challenge-platform" or
 * "cf-challenge".
 *
 * @param page - Page to inspect; any object with `title`, `content` and
 *   `waitForTimeout` works.
 * @param maxWaitMs - Total time budget in milliseconds. The page is always
 *   inspected at least once, even with a budget of 0.
 * @param pollIntervalMs - Pause between inspections; the last pause is
 *   shortened so the budget is not overshot.
 * @returns `true` once no challenge marker is seen, `false` if the budget
 *   runs out while the challenge is still showing.
 */
async function waitForCloudflare(
  page: ChallengePage,
  maxWaitMs = 60000,
  pollIntervalMs = 2000,
): Promise<boolean> {
  const deadline = Date.now() + maxWaitMs;

  for (;;) {
    // A failed read (for example mid-navigation) is treated as empty text,
    // which means "no challenge marker seen" for that field.
    const title = await page.title().catch(() => '');
    const content = await page.content().catch(() => '');

    const challenge =
      title.includes('Attention Required') ||
      title.includes('Just a moment') ||
      content.includes('challenge-platform') ||
      content.includes('cf-challenge');
    if (!challenge) return true;

    const remainingMs = deadline - Date.now();
    if (remainingMs <= 0) return false;
    await page.waitForTimeout(Math.min(pollIntervalMs, remainingMs));
  }
}
|
||||
|
||||
/** The one page capability extractProduct needs: running a function in the page. */
type EvaluablePage = {
  evaluate(pageFunction: () => ProductData): Promise<ProductData>;
};

/**
 * Reads product details out of the currently loaded page.
 *
 * All helpers live inside the `evaluate` callback because that function is
 * executed in the page, where nothing from this module's scope is available.
 * Each field is looked up through a list of fallback selectors; fields whose
 * selectors match nothing come back as `undefined` (`name` as an empty string).
 *
 * @param page - Page to read from; only its `evaluate` method is used.
 * @returns The extracted product, with de-duplicated image URLs and the
 *   page's current URL as `productUrl`.
 */
async function extractProduct(page: EvaluablePage): Promise<ProductData> {
  return page.evaluate((): ProductData => {
    // Trimmed text of the first selector in the list that yields any text.
    const pickText = (selectorList: string[]): string | undefined => {
      for (const sel of selectorList) {
        const el = document.querySelector<HTMLElement>(sel);
        const txt = el?.innerText?.trim();
        if (txt) return txt;
      }
      return undefined;
    };

    // Trimmed, non-empty texts of every element matching the selector.
    const pickAllTexts = (selector: string): string[] =>
      Array.from(document.querySelectorAll<HTMLElement>(selector))
        .map(el => el.innerText?.trim())
        .filter((t): t is string => Boolean(t));

    // Prefers a "$"-prefixed amount so that text such as "1g $45.00" yields
    // 45 rather than the leading weight; falls back to the first bare number.
    // Thousands separators ("$1,250.50") are accepted and stripped.
    const parsePrice = (text?: string | null): number | undefined => {
      if (!text) return undefined;
      const match =
        text.match(/\$\s*((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{1,2})?)/) ??
        text.match(/((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{1,2})?)/);
      const digits = match?.[1];
      return digits ? parseFloat(digits.replace(/,/g, '')) : undefined;
    };

    const name =
      pickText(['[data-testid="product-name"]', 'h1', '[class*="ProductTitle"]']) || '';
    const brand = pickText(['[data-testid="product-brand"]', '[class*="Brand"]']);
    const priceText =
      pickText([
        '[data-testid="product-price"]',
        '[data-testid*="price"]',
        '[class*="Price"]',
      ]) || '';
    const description = pickText([
      '[data-testid="product-description"]',
      'article',
      '[class*="Description"]',
    ]);

    const potencyTexts = pickAllTexts(
      '[data-testid*="thc"], [data-testid*="cbd"], [class*="Potency"]',
    );
    const thc = potencyTexts.find(t => t.toLowerCase().includes('thc')) || undefined;
    const cbd = potencyTexts.find(t => t.toLowerCase().includes('cbd')) || undefined;

    const category =
      pickText(['[data-testid="breadcrumb"]', '[class*="Breadcrumb"]', '[data-testid*="category"]']) ||
      undefined;

    const variantEls = Array.from(
      document.querySelectorAll(
        '[data-testid*="variant"], [data-testid*="option"], [class*="Variant"]',
      ),
    );
    const variants = variantEls
      .map(el => {
        // Label: first nested span/div text, else the element's own text.
        const label =
          (el.querySelector('span,div') as HTMLElement | null)?.innerText?.trim() ||
          el.textContent?.trim() ||
          '';
        const price = parsePrice(el.textContent || undefined);
        return { label, price };
      })
      .filter(v => v.label);

    // The former third selector (img[src*="https://images.dutchie.com"]) was a
    // strict subset of the first one and has been dropped.
    const imageUrls = Array.from(
      document.querySelectorAll(
        'img[src*="images.dutchie.com"], source[srcset*="images.dutchie.com"]',
      ),
    )
      .map(el => {
        if (el instanceof HTMLImageElement) return el.src;
        // For <source>, take the URL of the first srcset candidate.
        const srcset = (el as HTMLSourceElement).srcset || '';
        return srcset.split(',')[0]?.trim().split(/\s+/)[0];
      })
      .filter((u): u is string => !!u);

    return {
      name,
      brand,
      price: parsePrice(priceText),
      description,
      thc,
      cbd,
      category,
      variants,
      images: Array.from(new Set(imageUrls)),
      productUrl: window.location.href,
    };
  });
}
|
||||
|
||||
/**
 * Builds a file name from an arbitrary base string and an extension.
 *
 * The base is lower-cased, every run of characters outside `a-z0-9` becomes a
 * single dash, and one leading and one trailing dash are stripped. If nothing
 * is left, "image" is used instead. The extension is appended unchanged.
 *
 * @param base - Text to derive the file stem from.
 * @param ext - Extension without the leading dot.
 * @returns `<stem>.<ext>`
 */
function safeFileName(base: string, ext: string): string {
  const lowered = base.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9]+/g, '-');
  const trimmed = dashed.replace(/^-|-$/g, '');
  const stem = trimmed || 'image';
  return `${stem}.${ext}`;
}
|
||||
|
||||
/** Picks a file extension from a Content-Type header value; "bin" when unrecognised. */
function imageExtFromContentType(contentType: string): string {
  if (contentType.includes('png')) return 'png';
  if (contentType.includes('jpeg')) return 'jpg';
  if (contentType.includes('webp')) return 'webp';
  return 'bin';
}

/**
 * Downloads each image URL into IMAGE_DIR, one request at a time.
 *
 * File names are derived from the URL's path only, so query strings (resize
 * or format parameters, for instance) never end up in the name. When two URLs
 * map to the same name within one call, later ones get a numeric suffix
 * ("-2", "-3", ...) instead of overwriting the earlier file.
 *
 * A failed download is logged with `console.warn` and skipped; it does not
 * abort the remaining downloads.
 *
 * @param urls - Absolute image URLs.
 * @returns Paths of the files that were written, in download order.
 */
async function downloadImages(urls: string[]): Promise<string[]> {
  const saved: string[] = [];
  const usedNames = new Set<string>();

  for (const url of urls) {
    try {
      const res = await fetch(url);
      if (!res.ok) throw new Error(`HTTP ${res.status}`);
      const buf = Buffer.from(await res.arrayBuffer());

      // URL paths always use "/", so use the posix flavour of path helpers.
      const { pathname } = new URL(url);
      const rawExt = path.posix.extname(pathname);
      const urlExt = rawExt.slice(1).toLowerCase().replace(/[^a-z0-9]/g, '');
      const ext = urlExt || imageExtFromContentType(res.headers.get('content-type') ?? '');

      const stem = path.posix.basename(pathname, rawExt) || 'image';
      const firstChoice = safeFileName(stem, ext);
      // safeFileName returns "<slug>.<ext>"; recover the slug so a counter
      // can be inserted before the extension on collisions.
      const slug = firstChoice.slice(0, firstChoice.length - ext.length - 1);
      let fileName = firstChoice;
      for (let n = 2; usedNames.has(fileName); n += 1) {
        fileName = `${slug}-${n}.${ext}`;
      }
      usedNames.add(fileName);

      const filePath = path.join(IMAGE_DIR, fileName);
      await fs.writeFile(filePath, buf);
      saved.push(filePath);
    } catch (err) {
      console.warn(`Failed to download image ${url}:`, err);
    }
  }

  return saved;
}
|
||||
|
||||
/**
 * Runs the whole scrape: prepares output folders, opens the product page in a
 * headless browser, waits out any Cloudflare challenge, extracts the product,
 * downloads its images and writes everything to JSON_PATH.
 *
 * Every step, including folder creation and browser start-up, runs inside the
 * error handler, so a failure anywhere is logged and reflected in
 * `process.exitCode = 1` rather than surfacing as an unhandled rejection. The
 * browser is closed whenever it was successfully launched.
 */
async function main(): Promise<void> {
  try {
    await ensureDirs();

    const browser = await playwright.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-blink-features=AutomationControlled',
      ],
    });

    try {
      const context = await browser.newContext({
        viewport: { width: 1280, height: 900 },
        userAgent:
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      });
      const page = await context.newPage();

      console.log(`Navigating to product page...`);
      await page.goto(PRODUCT_URL, { waitUntil: 'domcontentloaded', timeout: 90000 });

      const cfOk = await waitForCloudflare(page, 60000);
      if (!cfOk) {
        throw new Error('Cloudflare challenge not passed in time');
      }

      // Best effort: extraction still proceeds if the selector never shows
      // up, but say so instead of hiding the timeout.
      try {
        await page.waitForSelector('[data-testid*="product"]', { timeout: 60000 });
      } catch {
        console.warn('Product selector did not appear in time; extracting from the page as-is.');
      }
      // Short extra pause before reading the DOM.
      await page.waitForTimeout(2000);

      const product = await extractProduct(page);
      console.log('Extracted product:');
      console.log(product);

      const imagePaths = await downloadImages(product.images);
      const finalProduct = { ...product, imagePaths };

      await fs.writeFile(JSON_PATH, JSON.stringify(finalProduct, null, 2));

      console.log(`Saved product JSON to ${JSON_PATH}`);
      if (imagePaths.length) {
        console.log(`Saved ${imagePaths.length} images to ${IMAGE_DIR}`);
      }
    } finally {
      await browser.close();
    }
  } catch (err) {
    console.error('Failed to scrape product:', err);
    process.exitCode = 1;
  }
}
|
||||
|
||||
// Script entry point. Surface any rejection that escapes main() instead of
// leaving a floating promise, and make the process exit with a non-zero code.
main().catch((err: unknown) => {
  console.error('Fatal error:', err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user