The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
115 lines
3.1 KiB
TypeScript
115 lines
3.1 KiB
TypeScript
import fs from 'fs/promises';
|
|
import path from 'path';
|
|
|
|
/**
 * A product record as it appears in the scraped input file, before cleaning.
 * Only `name` is required; every other field may be absent.
 */
type RawProduct = {
  /** Scraped display text; cleaning also looks here for a `…`-separated brand and `$` prices. */
  name: string;
  /** Explicit brand, when present; takes precedence over a brand inferred from `name`. */
  brand?: string;
  /** Numeric price, used only as a fallback when `name` contains no `$` amount. */
  price?: number;
  /** Size label, passed through unchanged by cleaning. */
  size?: string;
  /** Category label, passed through unchanged by cleaning. */
  category?: string;
  /** Product URL; when present it is the de-duplication key. */
  url?: string;
  /** Image URL, passed through unchanged by cleaning. */
  imageUrl?: string;
  /** Stock flag; anything other than an explicit `false` is treated as in stock. */
  inStock?: boolean;
};
|
|
|
|
/** One entry of the cleaned output: a brand together with its products. */
type BrandGroup = {
  /** Brand name shared by every product in `products`. */
  brand: string;
  /** Cleaned products belonging to this brand. */
  products: CleanProduct[];
};
|
|
|
|
/**
 * A product after cleaning. Unlike the raw record, `brand` and `inStock`
 * are always populated.
 */
type CleanProduct = {
  /** Product name with any `…`-separated trailing segment removed. */
  name: string;
  /** Brand name; `'Unknown'` when none could be determined. */
  brand: string;
  /** Lowest `$` amount found in the raw name, else the raw numeric price, else absent. */
  price?: number;
  /** Size label copied from the raw record. */
  size?: string;
  /** Category label copied from the raw record. */
  category?: string;
  /** Product URL copied from the raw record. */
  url?: string;
  /** Image URL copied from the raw record. */
  imageUrl?: string;
  /** `false` only when the raw record explicitly said so. */
  inStock: boolean;
};
|
|
|
|
// Both files live under <cwd>/scrape-output/deeply-rooted/.

/** Raw per-brand inventory that main() reads. */
const INPUT = path.resolve(process.cwd(), 'scrape-output', 'deeply-rooted', 'inventory-by-brand.json');

/** Destination that main() writes the cleaned, regrouped inventory to. */
const OUTPUT = path.resolve(process.cwd(), 'scrape-output', 'deeply-rooted', 'cleaned-inventory.json');
|
|
|
|
/**
 * Finds the lowest dollar amount mentioned in `text`.
 *
 * A dollar amount is `$` followed by digits and, optionally, a dot and exactly
 * two more digits (e.g. `$20` or `$19.99`).
 *
 * @param text - Text to scan for `$` amounts.
 * @param fallback - Value returned when `text` contains no `$` amount.
 * @returns The smallest amount found, otherwise `fallback` (possibly `undefined`).
 */
function extractPrice(text: string, fallback?: number): number | undefined {
  const dollarAmount = /\$([0-9]+(?:\.[0-9]{2})?)/g;

  // Keep a running minimum: when several prices are listed, the lowest one
  // is usually the sale price.
  let lowest: number | undefined;
  for (const hit of text.matchAll(dollarAmount)) {
    const amount = parseFloat(hit[1]);
    if (lowest === undefined || amount < lowest) {
      lowest = amount;
    }
  }

  return lowest === undefined ? fallback : lowest;
}
|
|
|
|
/**
 * Splits a scraped product label into a product name and a brand.
 *
 * The label is split on the ellipsis character (`…`). The first non-empty
 * segment becomes the name; the second, if any, is a candidate brand after
 * every character other than letters, digits, whitespace, `-`, `&` and `'`
 * is replaced by a space and runs of whitespace are collapsed.
 *
 * @param rawName - Scraped label, possibly of the form `name … brand`.
 * @param rawBrand - Explicit brand, if known. A non-blank value takes
 *   precedence over the brand inferred from `rawName`.
 * @returns The cleaned name and a brand that is never empty (`'Unknown'`
 *   when neither an explicit nor an inferred brand is available).
 */
function cleanBrandAndName(rawName: string, rawBrand?: string): { name: string; brand: string } {
  const parts = rawName
    .split('…')
    .map((p) => p.trim())
    .filter(Boolean);

  // Fall back to the whole trimmed label when splitting left nothing usable.
  const name = parts[0] || rawName.trim();

  const inferredBrand = parts[1]
    ?.replace(/[^a-z0-9\s\-\&']/gi, ' ')
    .replace(/\s+/g, ' ')
    .trim();

  // Trim BEFORE the fallback chain: a whitespace-only explicit brand is truthy
  // and would otherwise win the chain, only to be trimmed to an empty string.
  const explicitBrand = rawBrand?.trim();
  const brand = explicitBrand || inferredBrand || 'Unknown';

  return { name, brand };
}
|
|
|
|
/**
 * Converts one raw scraped product into its cleaned form.
 *
 * Name and brand come from cleanBrandAndName(); the price comes from
 * extractPrice() applied to the raw name, with the raw numeric price as the
 * fallback. The remaining descriptive fields are copied unchanged.
 *
 * @param p - Raw product record.
 * @returns The cleaned product.
 */
function cleanProduct(p: RawProduct): CleanProduct {
  const identity = cleanBrandAndName(p.name, p.brand);
  const resolvedPrice = extractPrice(p.name, p.price);

  const cleaned: CleanProduct = {
    name: identity.name,
    brand: identity.brand || 'Unknown',
    price: resolvedPrice,
    size: p.size,
    category: p.category,
    url: p.url,
    imageUrl: p.imageUrl,
    // A missing stock flag counts as in stock; only an explicit false does not.
    inStock: p.inStock !== false,
  };
  return cleaned;
}
|
|
|
|
/**
 * Removes duplicate products, keeping the first occurrence of each.
 *
 * Two products are duplicates when they share the same key: the URL when it
 * is non-empty, otherwise the lower-cased `name|brand` pair. The key is
 * trimmed before comparison.
 *
 * @param products - Products in their original order.
 * @returns The unique products, in order of first appearance.
 */
function dedupe(products: CleanProduct[]): CleanProduct[] {
  const firstByKey = new Map<string, CleanProduct>();

  for (const product of products) {
    const nameBrandKey = `${product.name.toLowerCase()}|${product.brand.toLowerCase()}`;
    const identityKey = (product.url || nameBrandKey).trim();

    if (firstByKey.has(identityKey)) {
      continue; // a product with this key was already kept
    }
    firstByKey.set(identityKey, product);
  }

  return [...firstByKey.values()];
}
|
|
|
|
/**
 * Buckets products by brand.
 *
 * Brands are compared exactly (case-sensitively); a product with an empty
 * brand is filed under `'Unknown'`. Groups appear in the order their brand is
 * first seen, and products keep their relative order within each group.
 *
 * @param products - Products to group.
 * @returns One group per distinct brand.
 */
function groupByBrand(products: CleanProduct[]): BrandGroup[] {
  const buckets = new Map<string, CleanProduct[]>();

  for (const product of products) {
    const brandKey = product.brand || 'Unknown';
    const bucket = buckets.get(brandKey);
    if (bucket === undefined) {
      buckets.set(brandKey, [product]);
    } else {
      bucket.push(product);
    }
  }

  const groups: BrandGroup[] = [];
  for (const [brand, members] of buckets) {
    groups.push({ brand, products: members });
  }
  return groups;
}
|
|
|
|
/**
 * Runs the whole post-processing pipeline:
 * read INPUT, clean every product, drop duplicates, regroup by brand, write
 * the result to OUTPUT as 2-space-indented JSON, then log a short summary.
 *
 * The parsed JSON is cast to the expected shape without runtime validation.
 * Read, parse and write errors are not handled here; they reject the
 * returned promise.
 */
async function main() {
  const inputText = await fs.readFile(INPUT, 'utf8');
  const rawGroups = JSON.parse(inputText) as { brand: string; products: RawProduct[] }[];

  // The input's own grouping is discarded; brands are re-derived after cleaning.
  const cleaned: CleanProduct[] = [];
  for (const { products } of rawGroups) {
    for (const rawProduct of products) {
      cleaned.push(cleanProduct(rawProduct));
    }
  }

  const deduped = dedupe(cleaned);
  const byBrand = groupByBrand(deduped);

  const serialized = JSON.stringify(byBrand, null, 2);
  await fs.writeFile(OUTPUT, serialized);

  let outOfStockCount = 0;
  for (const product of deduped) {
    if (!product.inStock) {
      outOfStockCount += 1;
    }
  }

  console.log(`Cleaned products: ${deduped.length}`);
  console.log(`Out of stock: ${outOfStockCount}`);
  console.log(`Brands: ${byBrand.length}`);
  console.log(`Saved to ${OUTPUT}`);
}
|
|
|
|
// Script entry point: run the pipeline. On failure, log the error and set a
// non-zero exit code rather than exiting immediately.
main().catch((failure: unknown) => {
  console.error('Post-process failed:', failure);
  process.exitCode = 1;
});
|