fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
191
backend/new-scrapers/scrape-deeply-rooted-with-images.ts
Normal file
191
backend/new-scrapers/scrape-deeply-rooted-with-images.ts
Normal file
@@ -0,0 +1,191 @@
|
||||
import { chromium, Frame } from 'playwright';
|
||||
import fs from 'fs/promises';
|
||||
import path from 'path';
|
||||
|
||||
/**
 * One menu item as read from a product card.
 * Only `name` is required; every other field is left undefined when the card
 * does not expose it.
 */
type Product = {
  /** Display name of the product. */
  name: string;
  /** Brand label shown on the card. */
  brand?: string;
  /** Price as a plain number, parsed from the card text. */
  price?: number;
  /** Size/weight label shown on the card. */
  size?: string;
  /** Category label shown on the card. */
  category?: string;
  /** Link to the product's detail page. */
  url?: string;
  /** Remote URL of the product image. */
  imageUrl?: string;
};
|
||||
|
||||
/** A product plus the local file path of its image, when one was saved. */
type ProductWithImagePath = Product & { imagePath?: string };
|
||||
|
||||
/** Menu page that is opened in the browser. */
const TARGET_URL = 'https://azdeeplyrooted.com/menu';
/** Root folder for everything this script writes, relative to the current working directory. */
const OUTPUT_DIR = path.join(process.cwd(), 'scrape-output', 'deeply-rooted');
/** Folder that receives the downloaded product images. */
const IMAGE_DIR = path.join(OUTPUT_DIR, 'images');
/** JSON file that receives the scraped product list. */
const JSON_PATH = path.join(OUTPUT_DIR, 'products.json');
|
||||
|
||||
/**
 * Creates the image directory if it does not exist yet.
 * `recursive: true` also creates any missing parent directories and makes the
 * call a no-op when the directory is already there.
 */
async function ensureDirs(): Promise<void> {
  await fs.mkdir(IMAGE_DIR, { recursive: true });
}
|
||||
|
||||
/**
 * Finds the embedded Dutchie menu iframe on the page and returns its frame.
 *
 * Waits up to 45 s for an iframe whose `src` or `srcdoc` contains "dutchie" or
 * whose id starts with "iframe-", then up to 30 s for that frame to reach
 * `domcontentloaded`.
 *
 * @param page Playwright page that hosts the iframe. The type is derived from
 *   `Frame['page']` so no additional import is needed.
 * @returns The frame belonging to the matched iframe element.
 * @throws Error when the iframe element has no accessible content frame, or
 *   when either wait times out (the latter is thrown by Playwright).
 */
async function getDutchieFrame(page: ReturnType<Frame['page']>): Promise<Frame> {
  const iframeHandle = await page.waitForSelector(
    'iframe[src*="dutchie"], iframe[srcdoc*="dutchie"], iframe[id^="iframe-"]',
    { timeout: 45000 }
  );

  const frame = await iframeHandle.contentFrame();
  if (!frame) {
    throw new Error('Unable to access embedded Dutchie iframe.');
  }

  await frame.waitForLoadState('domcontentloaded', { timeout: 30000 });
  return frame;
}
|
||||
|
||||
/**
 * Scrolls the menu so that lazily rendered product cards get loaded.
 *
 * Scrolls by 1200 px up to 30 times, pausing 900 ms after each scroll, and
 * stops as soon as a scroll does not increase the number of matching cards.
 * The frame is scrolled back to the top before returning.
 *
 * @param frame Frame that renders the product list.
 */
async function loadAllProducts(frame: Frame): Promise<void> {
  const cardSelector = '[data-testid*="product"], [data-testid*="card"]';
  const maxScrolls = 30;
  const countCards = (): Promise<number> => frame.$$eval(cardSelector, els => els.length);

  let knownCount = await countCards();
  for (let i = 0; i < maxScrolls; i++) {
    // Mouse input lives on the Page, not on Frame, so go through the owning page.
    await frame.page().mouse.wheel(0, 1200);
    await frame.waitForTimeout(900);

    const currentCount = await countCards();
    if (currentCount <= knownCount) break;
    // The count after this scroll is the baseline for the next one.
    knownCount = currentCount;
  }

  await frame.evaluate(() => window.scrollTo({ top: 0 }));
}
|
||||
|
||||
/**
 * Reads every product card currently rendered in the frame and converts it to
 * a `Product`.
 *
 * Field lookup tries the known `data-testid` hooks first and falls back to
 * looser heuristics (headings / raw card text) for the name. Cards for which
 * no name can be determined are dropped.
 *
 * @param frame Frame that renders the product list.
 * @returns One entry per card with a non-empty name.
 */
async function extractProducts(frame: Frame): Promise<Product[]> {
  // The callback below is serialized and executed inside the browser page, so
  // it cannot reference anything from this module; all helpers live inside it.
  return frame.evaluate(() => {
    const productLinkSelector = 'a[href*="/product/"]';

    // Trimmed innerText of the first selector that yields non-empty text.
    const firstText = (root: Element, selectors: string[]): string | undefined => {
      for (const selector of selectors) {
        const text = (root.querySelector(selector) as HTMLElement | null)?.innerText?.trim();
        if (text) return text;
      }
      return undefined;
    };

    const toAmount = (raw: string): number | undefined => {
      const amount = parseFloat(raw.replace(/,/g, ''));
      return Number.isFinite(amount) ? amount : undefined;
    };

    // Prefers a "$"-prefixed amount. A bare number is accepted only when the
    // text comes from a dedicated price element; in free card text the first
    // digits are just as likely to be part of the name or the weight.
    const parsePrice = (text: string, allowBareNumber: boolean): number | undefined => {
      const withSymbol = text.match(/\$\s*(\d[\d,]*(?:\.\d{1,2})?)/)?.[1];
      if (withSymbol !== undefined) return toAmount(withSymbol);
      if (!allowBareNumber) return undefined;
      const bare = text.match(/\d[\d,]*(?:\.\d{1,2})?/)?.[0];
      return bare !== undefined ? toAmount(bare) : undefined;
    };

    // Image lookup order: <img src="http…">, first <source srcset> candidate,
    // then a lazy-loading data-src attribute.
    const pickImage = (card: Element): string | undefined => {
      const img = card.querySelector('img[src^="http"]') as HTMLImageElement | null;
      if (img && typeof img.src === 'string' && img.src.startsWith('http')) {
        return img.src;
      }

      const srcset = (card.querySelector('source[srcset]') as HTMLSourceElement | null)?.srcset;
      const firstCandidate = srcset?.split(',')[0]?.trim().split(/\s+/)[0];
      if (firstCandidate?.startsWith('http')) return firstCandidate;

      const dataSrc = card.querySelector('img[data-src]')?.getAttribute('data-src');
      if (dataSrc?.startsWith('http')) return dataSrc;

      return undefined;
    };

    const cards = Array.from(
      document.querySelectorAll('[data-testid="product-list-item"], [data-testid="card-link"], [data-testid*="product-card"]')
    );

    return cards
      .map((card: Element) => {
        const name =
          firstText(card, ['[data-testid="product-card-name"]', '[data-testid="product-name"]', 'h3, h4']) ??
          // Last resort: first reasonably long line of the card's raw text.
          (card.textContent || '').split('\n').map(t => t.trim()).find(t => t.length > 3) ??
          '';

        const brand = firstText(card, ['[data-testid="product-card-brand"]', '[data-testid="product-brand"]']);

        const priceLabel = firstText(card, ['[data-testid="product-card-price"]']);
        const price =
          priceLabel !== undefined
            ? parsePrice(priceLabel, true)
            : parsePrice(card.textContent ?? '', false);

        const size = firstText(card, ['[data-testid*="size"]', '[data-testid*="weight"]']);
        const category = firstText(card, ['[data-testid*="category"]']);

        // closest() also matches the card itself (or a wrapping anchor), which
        // querySelector() alone would miss when the card *is* the link.
        const link = (card.closest(productLinkSelector) ?? card.querySelector(productLinkSelector)) as HTMLAnchorElement | null;
        const url = link?.href;

        const imageUrl = pickImage(card);

        return { name, brand, price, size, category, url, imageUrl };
      })
      .filter(p => p.name !== '');
  });
}
|
||||
|
||||
/**
 * Builds a filesystem-friendly file name of the form `<slug>.<ext>`.
 *
 * The slug is `base` with diacritics removed, lower-cased, every run of
 * characters outside `a-z0-9` collapsed to a single hyphen, and leading and
 * trailing hyphens stripped. It is capped at 100 characters so that very long
 * inputs cannot exceed file-name length limits. When nothing usable is left,
 * the slug is `product`.
 *
 * @param base Human-readable text to derive the name from.
 * @param ext File extension without the leading dot; used as given.
 * @returns The generated file name.
 */
function safeFileName(base: string, ext: string): string {
  const maxSlugLength = 100;
  const slug = base
    // Decompose accented letters so the base letter survives ("é" -> "e" + mark)…
    .normalize('NFKD')
    // …then drop the combining marks.
    .replace(/[\u0300-\u036f]/g, '')
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '')
    .slice(0, maxSlugLength)
    // Truncation may leave a hyphen at the end again.
    .replace(/-$/, '');

  return `${slug || 'product'}.${ext}`;
}
|
||||
|
||||
/**
 * Downloads the image of every product that has an `imageUrl` and stores it
 * in the image directory.
 *
 * Downloads run one after another. A failed download is logged as a warning
 * and the product is kept without an `imagePath`, so one bad image never
 * aborts the run.
 *
 * @param products Products as scraped from the page.
 * @returns The same products in the same order, each with `imagePath` set
 *   when its image was saved.
 */
async function downloadImages(products: Product[]): Promise<ProductWithImagePath[]> {
  const requestTimeoutMs = 30000;
  // Content-type fragment -> file extension, checked in order.
  const extensionByType: ReadonlyArray<readonly [string, string]> = [
    ['png', 'png'],
    ['jpeg', 'jpg'],
    ['jpg', 'jpg'],
    ['webp', 'webp'],
    ['gif', 'gif'],
  ];
  // File names handed out during this run, so that products whose names
  // slugify identically do not overwrite each other's image.
  const usedNames = new Set<string>();
  const results: ProductWithImagePath[] = [];

  for (const product of products) {
    if (!product.imageUrl) {
      results.push(product);
      continue;
    }

    try {
      // The timeout covers the request and the body read; without it a
      // stalled connection would hang the whole run.
      const res = await fetch(product.imageUrl, { signal: AbortSignal.timeout(requestTimeoutMs) });
      if (!res.ok) throw new Error(`HTTP ${res.status}`);
      const body = Buffer.from(await res.arrayBuffer());

      const contentType = res.headers.get('content-type') || '';
      const extFromType = extensionByType.find(([fragment]) => contentType.includes(fragment))?.[1] ?? 'bin';

      // The extension in the URL wins; the content type is the fallback.
      const urlExt = path.extname(new URL(product.imageUrl).pathname).slice(1);
      const ext = urlExt || extFromType;

      let fileName = safeFileName(product.name || 'product', ext);
      if (usedNames.has(fileName)) {
        const stem = fileName.slice(0, fileName.length - ext.length - 1);
        let suffix = 2;
        do {
          fileName = `${stem}-${suffix}.${ext}`;
          suffix++;
        } while (usedNames.has(fileName));
      }
      usedNames.add(fileName);

      const filePath = path.join(IMAGE_DIR, fileName);
      await fs.writeFile(filePath, body);
      results.push({ ...product, imagePath: filePath });
    } catch (err) {
      console.warn(`Failed to download image for ${product.name}: ${String(err)}`);
      results.push(product);
    }
  }

  return results;
}
|
||||
|
||||
/**
 * Runs the whole scrape: prepares the output directories, opens the menu page
 * in headless Chromium, loads and extracts the products, downloads their
 * images and writes the result as JSON.
 *
 * Any failure is logged and reported through `process.exitCode = 1`; the
 * browser is closed in every case once it has been launched.
 */
async function main(): Promise<void> {
  try {
    await ensureDirs();

    const browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-blink-features=AutomationControlled'],
    });

    // Everything after launch sits inside try/finally so the browser process
    // is closed even when creating the page fails.
    try {
      const page = await browser.newPage({
        viewport: { width: 1300, height: 900 },
        userAgent:
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      });

      console.log(`Navigating to ${TARGET_URL}...`);
      await page.goto(TARGET_URL, { waitUntil: 'domcontentloaded', timeout: 60000 });

      const frame = await getDutchieFrame(page);
      try {
        await frame.waitForSelector('[data-testid*="product"], [data-testid*="card"]', { timeout: 60000 });
      } catch (err) {
        // Best effort: continue with whatever is rendered, but leave a trace
        // so that an empty result can be explained.
        console.warn(`No product cards appeared before the timeout; continuing anyway: ${String(err)}`);
      }

      await loadAllProducts(frame);
      const products = await extractProducts(frame);
      console.log(`Found ${products.length} products, downloading images...`);

      const withImages = await downloadImages(products);
      await fs.writeFile(JSON_PATH, JSON.stringify(withImages, null, 2));

      console.log(`Saved data to ${JSON_PATH}`);
      console.log(`Images stored in ${IMAGE_DIR}`);
    } finally {
      await browser.close();
    }
  } catch (err) {
    console.error('Scrape failed:', err);
    process.exitCode = 1;
  }
}
|
||||
|
||||
// Script entry point. Any rejection that escapes main() is logged and turned
// into a failing exit code instead of surfacing as an unhandled rejection.
main().catch((err: unknown) => {
  console.error('Scrape failed:', err);
  process.exitCode = 1;
});
|
||||
Reference in New Issue
Block a user