Fix category-crawler-jobs store lookup query

- Fix column name from s.dutchie_plus_url to s.dutchie_url
- Add availability tracking and product freshness APIs
- Add crawl script for sequential dispensary processing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-01 00:07:00 -07:00
parent 20a7b69537
commit 9d8972aa86
15 changed files with 11604 additions and 42 deletions

View File

@@ -6,6 +6,55 @@ import { getImageUrl } from '../utils/minio';
const router = Router();
router.use(authMiddleware);
// Freshness threshold: data older than this is considered stale
const STALE_THRESHOLD_HOURS = 4;

/** Crawl-freshness metadata describing how old a store's data is. */
interface FreshnessInfo {
  /** ISO-8601 timestamp of the last crawl, or null when there is none. */
  last_crawl_at: string | null;
  /** True when there is no crawl or it is older than STALE_THRESHOLD_HOURS. */
  is_stale: boolean;
  /** Human-readable age, e.g. "Last crawled 3 hours ago (STALE)". */
  freshness: string;
  /** Age in hours rounded to one decimal place, or null when there is no crawl. */
  hours_since_crawl: number | null;
}

/**
 * Describe how fresh a crawl timestamp is relative to the current time.
 *
 * @param lastCrawlAt - Time of the last crawl, or null if none is recorded.
 * @returns Freshness metadata. A null or invalid date is reported as
 *   "Never crawled" and stale.
 */
function calculateFreshness(lastCrawlAt: Date | null): FreshnessInfo {
  // An Invalid Date would make toISOString() throw below, so it is treated
  // the same as a missing timestamp.
  if (!lastCrawlAt || Number.isNaN(lastCrawlAt.getTime())) {
    return {
      last_crawl_at: null,
      is_stale: true,
      freshness: 'Never crawled',
      hours_since_crawl: null
    };
  }

  const MS_PER_HOUR = 1000 * 60 * 60;
  // Clamp at zero: a timestamp slightly in the future (clock skew) must not
  // produce a negative age.
  const elapsedMs = Math.max(0, Date.now() - lastCrawlAt.getTime());
  const diffHours = elapsedMs / MS_PER_HOUR;
  const isStale = diffHours > STALE_THRESHOLD_HOURS;

  // Pick the unit from the *rounded* value so the text never reads
  // "60 minutes ago" or "24 hours ago".
  const mins = Math.round(diffHours * 60);
  const hrs = Math.round(diffHours);
  let freshnessText: string;
  if (mins < 60) {
    freshnessText = `Last crawled ${mins} minute${mins !== 1 ? 's' : ''} ago`;
  } else if (hrs < 24) {
    freshnessText = `Last crawled ${hrs} hour${hrs !== 1 ? 's' : ''} ago`;
  } else {
    const days = Math.round(diffHours / 24);
    freshnessText = `Last crawled ${days} day${days !== 1 ? 's' : ''} ago`;
  }

  if (isStale) {
    freshnessText += ' (STALE)';
  }

  return {
    last_crawl_at: lastCrawlAt.toISOString(),
    is_stale: isStale,
    freshness: freshnessText,
    hours_since_crawl: Math.round(diffHours * 10) / 10
  };
}
// Helper function to filter fields from object
function selectFields(obj: any, fields: string[]): any {
if (!fields || fields.length === 0) return obj;
@@ -216,11 +265,35 @@ router.get('/', async (req, res) => {
const countResult = await pool.query(countQuery, countParams);
// Get freshness info if store_id is specified
let freshnessInfo: FreshnessInfo | null = null;
let storeInfo: { id: number; name: string } | null = null;
if (store_id) {
const storeResult = await pool.query(
'SELECT id, name, last_scraped_at FROM stores WHERE id = $1',
[store_id]
);
if (storeResult.rows.length > 0) {
const store = storeResult.rows[0];
storeInfo = { id: store.id, name: store.name };
freshnessInfo = calculateFreshness(store.last_scraped_at);
}
}
res.json({
products,
total: parseInt(countResult.rows[0].count),
limit: parseInt(limit as string),
offset: parseInt(offset as string),
// Add freshness metadata when store_id is provided
...(freshnessInfo && {
store: storeInfo,
last_crawl_at: freshnessInfo.last_crawl_at,
is_stale: freshnessInfo.is_stale,
freshness: freshnessInfo.freshness,
hours_since_crawl: freshnessInfo.hours_since_crawl
}),
filters: {
store_id,
category_id,

View File

@@ -28,28 +28,150 @@ router.get('/', async (req, res) => {
}
});
// Get single store
// Freshness threshold in hours
const STALE_THRESHOLD_HOURS = 4;

/**
 * Describe how fresh a store's last scrape is relative to the current time.
 *
 * @param lastScrapedAt - Time of the last scrape, or null if none is recorded.
 * @returns The ISO timestamp, a stale flag (older than STALE_THRESHOLD_HOURS),
 *   a human-readable age such as "3 hours ago", and the age in hours rounded
 *   to one decimal place. A null or invalid date is reported as
 *   "Never scraped" and stale.
 */
function calculateFreshness(lastScrapedAt: Date | null): {
  last_scraped_at: string | null;
  is_stale: boolean;
  freshness: string;
  hours_since_scrape: number | null;
} {
  // An Invalid Date would make toISOString() throw below, so it is treated
  // the same as a missing timestamp.
  if (!lastScrapedAt || Number.isNaN(lastScrapedAt.getTime())) {
    return {
      last_scraped_at: null,
      is_stale: true,
      freshness: 'Never scraped',
      hours_since_scrape: null
    };
  }

  const MS_PER_HOUR = 1000 * 60 * 60;
  // Clamp at zero: a timestamp slightly in the future (clock skew) must not
  // produce a negative age.
  const elapsedMs = Math.max(0, Date.now() - lastScrapedAt.getTime());
  const diffHours = elapsedMs / MS_PER_HOUR;
  const isStale = diffHours > STALE_THRESHOLD_HOURS;

  // Pick the unit from the *rounded* value so the text never reads
  // "60 minutes ago" or "24 hours ago".
  const mins = Math.round(diffHours * 60);
  const hrs = Math.round(diffHours);
  let freshnessText: string;
  if (mins < 60) {
    freshnessText = `${mins} minute${mins !== 1 ? 's' : ''} ago`;
  } else if (hrs < 24) {
    freshnessText = `${hrs} hour${hrs !== 1 ? 's' : ''} ago`;
  } else {
    const days = Math.round(diffHours / 24);
    freshnessText = `${days} day${days !== 1 ? 's' : ''} ago`;
  }

  return {
    last_scraped_at: lastScrapedAt.toISOString(),
    is_stale: isStale,
    freshness: freshnessText,
    hours_since_scrape: Math.round(diffHours * 10) / 10
  };
}
/**
 * Guess the menu provider from a store's menu URL.
 *
 * Matching is a plain, case-sensitive substring test against the whole URL;
 * the first provider in the table whose marker appears wins.
 *
 * @param dutchieUrl - The stored menu URL, or null when none is set.
 * @returns The provider label, 'unknown' for a missing/empty URL, or 'Custom'
 *   when no known marker is found.
 */
function detectProvider(dutchieUrl: string | null): string {
  if (!dutchieUrl) {
    return 'unknown';
  }

  // Order matters: entries are checked top to bottom.
  const providerMarkers: ReadonlyArray<readonly [string, readonly string[]]> = [
    ['Dutchie', ['dutchie.com']],
    ['Jane', ['iheartjane.com', 'jane.co']],
    ['Treez', ['treez.io']],
    ['Weedmaps', ['weedmaps.com']],
    ['Leafly', ['leafly.com']]
  ];

  for (const [label, markers] of providerMarkers) {
    if (markers.some((marker) => dutchieUrl.includes(marker))) {
      return label;
    }
  }
  return 'Custom';
}
// Get single store with full details
router.get('/:id', async (req, res) => {
try {
const { id } = req.params;
// Get store with counts and linked dispensary
const result = await pool.query(`
SELECT
SELECT
s.*,
d.id as dispensary_id,
d.name as dispensary_name,
d.slug as dispensary_slug,
d.state as dispensary_state,
d.city as dispensary_city,
d.address as dispensary_address,
d.menu_provider as dispensary_menu_provider,
COUNT(DISTINCT p.id) as product_count,
COUNT(DISTINCT c.id) as category_count
COUNT(DISTINCT c.id) as category_count,
COUNT(DISTINCT p.id) FILTER (WHERE p.in_stock = true) as in_stock_count,
COUNT(DISTINCT p.id) FILTER (WHERE p.in_stock = false) as out_of_stock_count
FROM stores s
LEFT JOIN dispensaries d ON s.dispensary_id = d.id
LEFT JOIN products p ON s.id = p.store_id
LEFT JOIN categories c ON s.id = c.store_id
WHERE s.id = $1
GROUP BY s.id
GROUP BY s.id, d.id, d.name, d.slug, d.state, d.city, d.address, d.menu_provider
`, [id]);
if (result.rows.length === 0) {
return res.status(404).json({ error: 'Store not found' });
}
res.json(result.rows[0]);
const store = result.rows[0];
// Get recent crawl jobs for this store
const jobsResult = await pool.query(`
SELECT
id, status, job_type, trigger_type,
started_at, completed_at,
products_found, products_new, products_updated,
in_stock_count, out_of_stock_count,
error_message
FROM crawl_jobs
WHERE store_id = $1
ORDER BY created_at DESC
LIMIT 10
`, [id]);
// Get schedule info if exists
const scheduleResult = await pool.query(`
SELECT
enabled, interval_hours, next_run_at, last_run_at
FROM store_crawl_schedule
WHERE store_id = $1
`, [id]);
// Calculate freshness
const freshness = calculateFreshness(store.last_scraped_at);
// Detect provider from URL
const provider = detectProvider(store.dutchie_url);
// Build response
const response = {
...store,
provider,
freshness: freshness.freshness,
is_stale: freshness.is_stale,
hours_since_scrape: freshness.hours_since_scrape,
linked_dispensary: store.dispensary_id ? {
id: store.dispensary_id,
name: store.dispensary_name,
slug: store.dispensary_slug,
state: store.dispensary_state,
city: store.dispensary_city,
address: store.dispensary_address,
menu_provider: store.dispensary_menu_provider
} : null,
schedule: scheduleResult.rows[0] || null,
recent_jobs: jobsResult.rows
};
// Remove redundant dispensary fields from root
delete response.dispensary_name;
delete response.dispensary_slug;
delete response.dispensary_state;
delete response.dispensary_city;
delete response.dispensary_address;
delete response.dispensary_menu_provider;
res.json(response);
} catch (error) {
console.error('Error fetching store:', error);
res.status(500).json({ error: 'Failed to fetch store' });

View File

@@ -0,0 +1,26 @@
import { runDispensaryOrchestrator } from '../services/dispensary-orchestrator';
// Run 5 crawlers sequentially to avoid OOM
const dispensaryIds = [112, 81, 115, 140, 177];

/**
 * Run the dispensary orchestrator for each id in `dispensaryIds`, one at a
 * time, logging a short summary per dispensary.
 *
 * A failure for one dispensary is logged and does not stop the remaining
 * ones; if any dispensary failed, the process exit code is set to 1 so that
 * callers (cron, CI, shell scripts) can detect it.
 */
async function run(): Promise<void> {
  const total = dispensaryIds.length;
  let failures = 0;

  console.log(`Starting ${total} crawlers SEQUENTIALLY...`);

  for (const id of dispensaryIds) {
    console.log(`\n=== Starting crawler for dispensary ${id} ===`);
    try {
      const result = await runDispensaryOrchestrator(id);
      console.log(` Status: ${result.status}`);
      console.log(` Summary: ${result.summary}`);
      if (result.productsFound) {
        console.log(` Products: ${result.productsFound} found, ${result.productsNew} new, ${result.productsUpdated} updated`);
      }
    } catch (e: unknown) {
      failures++;
      // Non-Error values can be thrown too; never print "undefined".
      console.error(` ERROR: ${e instanceof Error ? e.message : String(e)}`);
    }
  }

  console.log(`\n=== All ${total} crawlers complete ===`);

  if (failures > 0) {
    // Use exitCode rather than process.exit() so pending output is flushed.
    process.exitCode = 1;
  }
}

run().catch((e: unknown) => {
  console.error('Fatal:', e instanceof Error ? e.message : e);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,240 @@
/**
* Availability Service
*
* Normalizes product availability from various menu providers and tracks
* state transitions for inventory analytics.
*/
// Threshold for considering stock as "limited"
const LIMITED_THRESHOLD = 5;

/** Normalized stock state of a product. */
export type AvailabilityStatus = 'in_stock' | 'out_of_stock' | 'limited' | 'unknown';

/** Result of normalizing a provider-specific availability payload. */
export interface NormalizedAvailability {
  status: AvailabilityStatus;
  /** Stock count when one could be extracted, otherwise null. */
  quantity: number | null;
  /** The raw availability fields the decision was based on (for debugging). */
  raw: any;
}

/** Availability signals scraped from unstructured page or card content. */
export interface AvailabilityHints {
  hasOutOfStockBadge?: boolean;
  hasLimitedBadge?: boolean;
  hasInStockBadge?: boolean;
  stockText?: string;
  quantityText?: string;
}

/** Map a free-form status string to a status by keyword; 'unknown' if none match. */
function statusFromText(text: string): AvailabilityStatus {
  const statusLower = text.toLowerCase();
  if (statusLower.includes('out') || statusLower.includes('unavailable')) {
    return 'out_of_stock';
  }
  if (statusLower.includes('limited') || statusLower.includes('low')) {
    return 'limited';
  }
  if (statusLower.includes('in') || statusLower.includes('available')) {
    return 'in_stock';
  }
  return 'unknown';
}

/** Classify a known stock count against LIMITED_THRESHOLD. */
function statusFromQuantity(quantity: number): AvailabilityStatus {
  if (quantity === 0) {
    return 'out_of_stock';
  }
  return quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock';
}

/**
 * Normalize availability from a Dutchie product
 *
 * Dutchie products can have various availability indicators:
 * - potencyAmount.quantity: explicit stock count
 * - status: sometimes includes stock status
 * - variants[].quantity: stock per variant
 * - isInStock / inStock: boolean flags
 *
 * Precedence: explicit boolean flags, then keywords in the status string,
 * then the extracted quantity. The quantity is also used when a status
 * string is present but contains no recognised keyword (e.g. "Active"),
 * so such products are not left as 'unknown' despite a known stock count.
 *
 * @param dutchieProduct - Raw product object as scraped from the provider.
 * @returns The normalized status, the extracted quantity (or null), and the
 *   raw fields that were found on the product.
 */
export function normalizeAvailability(dutchieProduct: any): NormalizedAvailability {
  const raw: any = {};

  // Collect raw availability data for debugging
  if (dutchieProduct.potencyAmount?.quantity !== undefined) {
    raw.potencyQuantity = dutchieProduct.potencyAmount.quantity;
  }
  if (dutchieProduct.status !== undefined) {
    raw.status = dutchieProduct.status;
  }
  if (dutchieProduct.isInStock !== undefined) {
    raw.isInStock = dutchieProduct.isInStock;
  }
  if (dutchieProduct.inStock !== undefined) {
    raw.inStock = dutchieProduct.inStock;
  }
  if (dutchieProduct.variants?.length) {
    const variantQuantities = dutchieProduct.variants
      .filter((v: any) => v.quantity !== undefined)
      .map((v: any) => ({ option: v.option, quantity: v.quantity }));
    if (variantQuantities.length) {
      raw.variantQuantities = variantQuantities;
    }
  }

  // Try to extract quantity
  let quantity: number | null = null;
  if (typeof dutchieProduct.potencyAmount?.quantity === 'number') {
    // Check potencyAmount.quantity first (most reliable for Dutchie)
    quantity = dutchieProduct.potencyAmount.quantity;
  } else if (dutchieProduct.variants?.length) {
    // Sum variant quantities if available
    const totalVariantQty = dutchieProduct.variants.reduce((sum: number, v: any) => {
      return sum + (typeof v.quantity === 'number' ? v.quantity : 0);
    }, 0);
    // A zero total is left as null: it cannot be told apart from variants
    // that simply carry no numeric quantity.
    if (totalVariantQty > 0) {
      quantity = totalVariantQty;
    }
  }

  // Determine status
  let status: AvailabilityStatus = 'unknown';
  if (dutchieProduct.isInStock === false || dutchieProduct.inStock === false) {
    // Explicit boolean flags take precedence
    status = 'out_of_stock';
  } else if (dutchieProduct.isInStock === true || dutchieProduct.inStock === true) {
    status = quantity !== null && quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock';
  } else {
    // Check status string
    if (typeof dutchieProduct.status === 'string') {
      status = statusFromText(dutchieProduct.status);
    }
    // Infer from quantity when the status string was absent or inconclusive
    if (status === 'unknown' && quantity !== null) {
      status = statusFromQuantity(quantity);
    }
  }

  return { status, quantity, raw };
}
/**
 * Extract availability hints from page content or product card HTML
 *
 * Used for sandbox provider scraping where we don't have structured data.
 * Matching is done on the lower-cased text, so `quantityText` and `stockText`
 * are returned in lower case.
 *
 * @param pageContent - Full page text/HTML; used when no product element is given.
 * @param productElement - Optional text/HTML of a single product card. When
 *   non-empty it is inspected instead of `pageContent`.
 * @returns Badge flags plus any quantity / stock text that was found.
 */
export function extractAvailabilityHints(pageContent: string, productElement?: string): AvailabilityHints {
  const hints: AvailabilityHints = {};
  const content = (productElement || pageContent).toLowerCase();

  // Check for out-of-stock indicators
  const oosPatterns = [
    'out of stock',
    'out-of-stock',
    'sold out',
    'soldout',
    'unavailable',
    'not available',
    'coming soon',
    'notify me'
  ];
  hints.hasOutOfStockBadge = oosPatterns.some(p => content.includes(p));

  // Check for limited stock indicators
  const limitedPhrases = [
    'limited stock',
    'limited quantity',
    'low stock',
    'few remaining',
    'almost gone',
    'selling fast'
  ];
  const onlyNLeft = /only \d+ left/;
  hints.hasLimitedBadge =
    limitedPhrases.some(p => content.includes(p)) || onlyNLeft.test(content);

  // Check for in-stock indicators
  const inStockPhrases = [
    'in stock',
    'in-stock',
    'add to cart',
    'add to bag',
    'buy now'
  ];
  // "available" must not be counted when it is part of "unavailable" or
  // "not available" - those are out-of-stock signals.
  const positiveAvailable = /(?<!un)(?<!not )available/;
  hints.hasInStockBadge =
    inStockPhrases.some(p => content.includes(p)) || positiveAvailable.test(content);

  // Try to extract quantity text
  const qtyMatch = content.match(/(\d+)\s*(left|remaining|in stock|available)/i);
  if (qtyMatch) {
    hints.quantityText = qtyMatch[0];
  }

  // Look for explicit stock text (runs up to the next '<', i.e. the next tag)
  const stockTextMatch = content.match(/(out of stock|in stock|low stock|limited|sold out)[^<]*/i);
  if (stockTextMatch) {
    hints.stockText = stockTextMatch[0].trim();
  }

  return hints;
}
/**
 * Convert availability hints to normalized availability
 *
 * The quantity is the first run of digits in `hints.quantityText`, if any.
 * Status precedence: out-of-stock badge or a parsed quantity of zero, then
 * limited badge, then in-stock badge (downgraded to 'limited' when the
 * quantity is at or below LIMITED_THRESHOLD). Otherwise 'unknown'.
 *
 * @param hints - Signals scraped from page or product-card content.
 * @returns Normalized availability with the hints attached as `raw`.
 */
export function hintsToAvailability(hints: AvailabilityHints): NormalizedAvailability {
  // Extract quantity if present
  const digits = hints.quantityText ? hints.quantityText.match(/(\d+)/) : null;
  const quantity: number | null = digits ? parseInt(digits[1], 10) : null;

  // Determine status from hints
  let status: AvailabilityStatus = 'unknown';
  if (hints.hasOutOfStockBadge || quantity === 0) {
    // Text such as "0 in stock" contains an in-stock phrase, but a zero
    // count means nothing can be bought.
    status = 'out_of_stock';
  } else if (hints.hasLimitedBadge) {
    status = 'limited';
  } else if (hints.hasInStockBadge) {
    status = quantity !== null && quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock';
  }

  return {
    status,
    quantity,
    raw: hints
  };
}
/** Per-status product tallies plus the number of products whose status changed. */
export interface AvailabilityCounts {
  in_stock: number;
  out_of_stock: number;
  limited: number;
  unknown: number;
  changed: number;
}

/**
 * Tally products by availability status.
 *
 * A product without an `availability_status` is counted as 'unknown'. A
 * product counts as "changed" when it has a `previous_status` that differs
 * from its (defaulted) current status.
 *
 * @param products - Products carrying optional current and previous statuses.
 * @returns The totals for each status and the changed count.
 */
export function aggregateAvailability(
  products: Array<{ availability_status?: AvailabilityStatus; previous_status?: AvailabilityStatus }>
): AvailabilityCounts {
  const emptyTally: AvailabilityCounts = {
    in_stock: 0,
    out_of_stock: 0,
    limited: 0,
    unknown: 0,
    changed: 0
  };

  return products.reduce((tally, { availability_status, previous_status }) => {
    const current = availability_status || 'unknown';
    tally[current] += 1;

    const hasChanged = Boolean(previous_status) && previous_status !== current;
    if (hasChanged) {
      tally.changed += 1;
    }
    return tally;
  }, emptyTally);
}

View File

@@ -106,9 +106,10 @@ async function updateCategoryScanTime(
}
async function getStoreIdForDispensary(dispensaryId: number): Promise<number | null> {
// First check if dispensary has menu_url - if so, try to match with stores.dutchie_url
const result = await pool.query(
`SELECT s.id FROM stores s
JOIN dispensaries d ON d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%'
JOIN dispensaries d ON d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%'
WHERE d.id = $1
LIMIT 1`,
[dispensaryId]
@@ -118,6 +119,7 @@ async function getStoreIdForDispensary(dispensaryId: number): Promise<number | n
return result.rows[0].id;
}
// Try matching by slug
const result2 = await pool.query(
`SELECT s.id FROM stores s
JOIN dispensaries d ON d.website ILIKE '%' || s.slug || '%'

View File

@@ -14,6 +14,7 @@ import { scrapeStore } from '../scraper-v2';
import puppeteer, { Browser, Page } from 'puppeteer';
import { promises as fs } from 'fs';
import path from 'path';
import { extractAvailabilityHints } from './availability';
const WORKER_ID = `crawler-${process.pid}-${Date.now()}`;
@@ -500,9 +501,13 @@ export async function runSandboxCrawlJob(dispensaryId: number, sandboxId?: numbe
};
});
// Extract availability hints from page content
const availabilityHints = extractAvailabilityHints(html);
analysisData.page_structures.push({
url,
...structure,
availabilityHints,
});
}

View File

@@ -8,6 +8,7 @@ import { logger } from './logger';
import { registerScraper, updateScraperStats, completeScraper } from '../routes/scraper-monitor';
import { incrementProxyFailure, getActiveProxy, isBotDetectionError, putProxyInTimeout } from './proxy';
import { bypassAgeGate, detectStateFromUrl, setAgeGateCookies } from '../utils/age-gate';
import { normalizeAvailability, AvailabilityStatus } from './availability';
// Apply stealth plugin for antidetect/anti-fingerprinting
puppeteer.use(StealthPlugin());
@@ -35,6 +36,10 @@ interface Product {
imageUrl?: string;
dutchieUrl: string;
metadata?: any;
// Availability tracking
availabilityStatus?: AvailabilityStatus;
availabilityRaw?: any;
stockQuantity?: number | null;
}
export const USER_AGENTS = {
@@ -584,6 +589,8 @@ export async function scrapeCategory(storeId: number, categoryId: number, userAg
const formattedProducts: Product[] = products.map((p, index) => {
const sanitized = sanitizeProductData(p);
// Normalize availability from Dutchie product data
const availability = normalizeAvailability(p);
return {
dutchieProductId: `${category.store_slug}-${category.slug}-${Date.now()}-${index}`,
@@ -599,7 +606,10 @@ export async function scrapeCategory(storeId: number, categoryId: number, userAg
weight: sanitized.weight,
imageUrl: p.imageUrl,
dutchieUrl: p.href,
metadata: p.metadata || {}
metadata: p.metadata || {},
availabilityStatus: availability.status,
availabilityRaw: availability.raw,
stockQuantity: availability.quantity
};
});
@@ -660,47 +670,72 @@ async function autoScroll(page: Page) {
export async function saveProducts(storeId: number, categoryId: number, products: Product[]): Promise<void> {
const client = await pool.connect();
try {
await client.query('BEGIN');
logger.info('scraper', `Saving ${products.length} products to database...`);
// Mark all products as out-of-stock before processing (they'll be re-marked if found)
// Also update availability_status and last_seen_out_of_stock_at for state transition tracking
await client.query(`
UPDATE products
SET in_stock = false
WHERE store_id = $1 AND category_id = $2
SET in_stock = false,
availability_status = 'out_of_stock',
last_seen_out_of_stock_at = CASE
WHEN availability_status != 'out_of_stock' THEN CURRENT_TIMESTAMP
ELSE last_seen_out_of_stock_at
END
WHERE store_id = $1 AND category_id = $2 AND in_stock = true
`, [storeId, categoryId]);
for (const product of products) {
try {
// Get availability from product (defaults to in_stock if product exists in scraped data)
const availStatus = product.availabilityStatus || 'in_stock';
const availRaw = product.availabilityRaw ? JSON.stringify(product.availabilityRaw) : null;
const stockQty = product.stockQuantity ?? null;
const existingResult = await client.query(`
SELECT id, image_url, local_image_path
SELECT id, image_url, local_image_path, availability_status
FROM products
WHERE store_id = $1 AND name = $2 AND category_id = $3
AND (variant = $4 OR (variant IS NULL AND $4 IS NULL))
`, [storeId, product.name, categoryId, product.variant || null]);
let localImagePath = null;
let productId: number;
if (existingResult.rows.length > 0) {
productId = existingResult.rows[0].id;
localImagePath = existingResult.rows[0].local_image_path;
const prevStatus = existingResult.rows[0].availability_status;
// Determine if we need to update last_seen_in_stock_at
const isNowInStock = availStatus === 'in_stock' || availStatus === 'limited';
const wasOutOfStock = prevStatus === 'out_of_stock' || prevStatus === 'unknown';
await client.query(`
UPDATE products
SET name = $1, variant = $2, description = $3, price = $4,
strain_type = $5, thc_percentage = $6, cbd_percentage = $7,
brand = $8, weight = $9, image_url = $10, dutchie_url = $11,
in_stock = true, metadata = $12, last_seen_at = CURRENT_TIMESTAMP,
updated_at = CURRENT_TIMESTAMP
updated_at = CURRENT_TIMESTAMP,
availability_status = $14,
availability_raw = $15,
stock_quantity = $16,
last_seen_in_stock_at = CASE
WHEN $17 THEN CURRENT_TIMESTAMP
ELSE last_seen_in_stock_at
END
WHERE id = $13
`, [
product.name, product.variant, product.description, product.price,
product.strainType, product.thcPercentage, product.cbdPercentage,
product.brand, product.weight, product.imageUrl, product.dutchieUrl,
JSON.stringify(product.metadata), productId
JSON.stringify(product.metadata), productId, availStatus, availRaw, stockQty,
isNowInStock && wasOutOfStock
]);
} else {
// Generate unique slug from product name + timestamp + random suffix
@@ -716,14 +751,15 @@ export async function saveProducts(storeId: number, categoryId: number, products
INSERT INTO products (
store_id, category_id, dutchie_product_id, name, slug, variant, description,
price, strain_type, thc_percentage, cbd_percentage,
brand, weight, image_url, dutchie_url, in_stock, metadata
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16)
brand, weight, image_url, dutchie_url, in_stock, metadata,
availability_status, availability_raw, stock_quantity, last_seen_in_stock_at
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16, $17, $18, $19, CURRENT_TIMESTAMP)
RETURNING id
`, [
storeId, categoryId, product.dutchieProductId, product.name, slug, product.variant, product.description,
product.price, product.strainType, product.thcPercentage, product.cbdPercentage,
product.brand, product.weight, product.imageUrl, product.dutchieUrl,
JSON.stringify(product.metadata)
JSON.stringify(product.metadata), availStatus, availRaw, stockQty
]);
productId = insertResult.rows[0].id;