Fix category-crawler-jobs store lookup query
- Fix column name from s.dutchie_plus_url to s.dutchie_url
- Add availability tracking and product freshness APIs
- Add crawl script for sequential dispensary processing

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
52
backend/migrations/024_product_availability_tracking.sql
Normal file
52
backend/migrations/024_product_availability_tracking.sql
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
-- Migration 024: Product Availability Tracking
-- Adds normalized availability status and transition tracking.
-- All schema changes use IF NOT EXISTS so they are no-ops when the object is already present.

-- Add availability columns to products table
ALTER TABLE products ADD COLUMN IF NOT EXISTS availability_status VARCHAR(20) DEFAULT 'unknown';
ALTER TABLE products ADD COLUMN IF NOT EXISTS availability_raw JSONB;
ALTER TABLE products ADD COLUMN IF NOT EXISTS last_seen_in_stock_at TIMESTAMPTZ;
ALTER TABLE products ADD COLUMN IF NOT EXISTS last_seen_out_of_stock_at TIMESTAMPTZ;

-- NOTE(review): the scraper's product UPDATE introduced alongside this migration also
-- writes products.stock_quantity. It is added here defensively; if an earlier migration
-- already created the column this statement does nothing -- confirm and drop if redundant.
ALTER TABLE products ADD COLUMN IF NOT EXISTS stock_quantity INTEGER;

-- Add comment for clarity
COMMENT ON COLUMN products.availability_status IS 'Normalized status: in_stock, out_of_stock, limited, unknown';
COMMENT ON COLUMN products.availability_raw IS 'Raw availability payload from provider for debugging';
COMMENT ON COLUMN products.last_seen_in_stock_at IS 'Last time product was seen in stock';
COMMENT ON COLUMN products.last_seen_out_of_stock_at IS 'Last time product was seen out of stock';

-- Create indexes for availability queries
CREATE INDEX IF NOT EXISTS idx_products_availability_status ON products(availability_status);
CREATE INDEX IF NOT EXISTS idx_products_availability_by_store ON products(store_id, availability_status) WHERE store_id IS NOT NULL;
CREATE INDEX IF NOT EXISTS idx_products_availability_by_dispensary ON products(dispensary_id, availability_status) WHERE dispensary_id IS NOT NULL;

-- Backfill availability_status from existing in_stock column.
-- Only rows still at the default ('unknown') or NULL are touched; a NULL in_stock stays 'unknown'.
UPDATE products
SET availability_status = CASE
    WHEN in_stock = true THEN 'in_stock'
    WHEN in_stock = false THEN 'out_of_stock'
    ELSE 'unknown'
END
WHERE availability_status = 'unknown' OR availability_status IS NULL;

-- Set last_seen_in_stock_at for currently in-stock products
-- (best available timestamp: last_seen_at, then updated_at, then the migration time)
UPDATE products
SET last_seen_in_stock_at = COALESCE(last_seen_at, updated_at, NOW())
WHERE in_stock = true AND last_seen_in_stock_at IS NULL;

-- Set last_seen_out_of_stock_at for currently out-of-stock products
UPDATE products
SET last_seen_out_of_stock_at = COALESCE(last_seen_at, updated_at, NOW())
WHERE in_stock = false AND last_seen_out_of_stock_at IS NULL;

-- Add availability tracking to dispensary_crawl_jobs
ALTER TABLE dispensary_crawl_jobs ADD COLUMN IF NOT EXISTS in_stock_count INTEGER;
ALTER TABLE dispensary_crawl_jobs ADD COLUMN IF NOT EXISTS out_of_stock_count INTEGER;
ALTER TABLE dispensary_crawl_jobs ADD COLUMN IF NOT EXISTS limited_count INTEGER;
ALTER TABLE dispensary_crawl_jobs ADD COLUMN IF NOT EXISTS unknown_count INTEGER;
ALTER TABLE dispensary_crawl_jobs ADD COLUMN IF NOT EXISTS availability_changed_count INTEGER;

-- Add availability tracking to crawl_jobs (store-based)
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS in_stock_count INTEGER;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS out_of_stock_count INTEGER;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS limited_count INTEGER;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS unknown_count INTEGER;
ALTER TABLE crawl_jobs ADD COLUMN IF NOT EXISTS availability_changed_count INTEGER;
|
||||||
@@ -6,6 +6,55 @@ import { getImageUrl } from '../utils/minio';
|
|||||||
const router = Router();
|
const router = Router();
|
||||||
router.use(authMiddleware);
|
router.use(authMiddleware);
|
||||||
|
|
||||||
|
// Freshness threshold: data older than this is considered stale
const STALE_THRESHOLD_HOURS = 4;

/** Crawl-freshness metadata merged into product list responses. */
interface FreshnessInfo {
  /** ISO-8601 timestamp of the last crawl, or null when there is none. */
  last_crawl_at: string | null;
  /** True when the data is older than STALE_THRESHOLD_HOURS or was never crawled. */
  is_stale: boolean;
  /** Human-readable age, e.g. "Last crawled 3 hours ago". */
  freshness: string;
  /** Age in hours rounded to one decimal place, or null when there is no crawl. */
  hours_since_crawl: number | null;
}

/**
 * Describe how fresh a store's data is relative to the current time.
 *
 * @param lastCrawlAt Time of the last crawl, or null if the store was never crawled.
 * @returns Freshness metadata. A null or invalid date is reported as "Never crawled"
 *          (stale). A timestamp in the future (clock skew) is treated as "just now".
 */
function calculateFreshness(lastCrawlAt: Date | null): FreshnessInfo {
  // An invalid Date would render as "NaN days ago" and make toISOString() throw.
  if (!lastCrawlAt || Number.isNaN(lastCrawlAt.getTime())) {
    return {
      last_crawl_at: null,
      is_stale: true,
      freshness: 'Never crawled',
      hours_since_crawl: null
    };
  }

  // Clamp at zero so a timestamp slightly ahead of this host's clock never yields a negative age.
  const elapsedMs = Math.max(0, Date.now() - lastCrawlAt.getTime());
  const diffHours = elapsedMs / (1000 * 60 * 60);
  const isStale = diffHours > STALE_THRESHOLD_HOURS;

  const describe = (count: number, unit: string): string =>
    `Last crawled ${count} ${unit}${count !== 1 ? 's' : ''} ago`;

  // Pick the unit from the *rounded* value so the text rolls over to the next unit
  // instead of reading "60 minutes ago" or "24 hours ago".
  const mins = Math.round(diffHours * 60);
  const hrs = Math.round(diffHours);

  let freshnessText: string;
  if (mins < 60) {
    freshnessText = describe(mins, 'minute');
  } else if (hrs < 24) {
    freshnessText = describe(hrs, 'hour');
  } else {
    freshnessText = describe(Math.round(diffHours / 24), 'day');
  }

  if (isStale) {
    freshnessText += ' (STALE)';
  }

  return {
    last_crawl_at: lastCrawlAt.toISOString(),
    is_stale: isStale,
    freshness: freshnessText,
    hours_since_crawl: Math.round(diffHours * 10) / 10
  };
}
|
||||||
|
|
||||||
// Helper function to filter fields from object
|
// Helper function to filter fields from object
|
||||||
function selectFields(obj: any, fields: string[]): any {
|
function selectFields(obj: any, fields: string[]): any {
|
||||||
if (!fields || fields.length === 0) return obj;
|
if (!fields || fields.length === 0) return obj;
|
||||||
@@ -216,11 +265,35 @@ router.get('/', async (req, res) => {
|
|||||||
|
|
||||||
const countResult = await pool.query(countQuery, countParams);
|
const countResult = await pool.query(countQuery, countParams);
|
||||||
|
|
||||||
|
// Get freshness info if store_id is specified
|
||||||
|
let freshnessInfo: FreshnessInfo | null = null;
|
||||||
|
let storeInfo: { id: number; name: string } | null = null;
|
||||||
|
|
||||||
|
if (store_id) {
|
||||||
|
const storeResult = await pool.query(
|
||||||
|
'SELECT id, name, last_scraped_at FROM stores WHERE id = $1',
|
||||||
|
[store_id]
|
||||||
|
);
|
||||||
|
if (storeResult.rows.length > 0) {
|
||||||
|
const store = storeResult.rows[0];
|
||||||
|
storeInfo = { id: store.id, name: store.name };
|
||||||
|
freshnessInfo = calculateFreshness(store.last_scraped_at);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
res.json({
|
res.json({
|
||||||
products,
|
products,
|
||||||
total: parseInt(countResult.rows[0].count),
|
total: parseInt(countResult.rows[0].count),
|
||||||
limit: parseInt(limit as string),
|
limit: parseInt(limit as string),
|
||||||
offset: parseInt(offset as string),
|
offset: parseInt(offset as string),
|
||||||
|
// Add freshness metadata when store_id is provided
|
||||||
|
...(freshnessInfo && {
|
||||||
|
store: storeInfo,
|
||||||
|
last_crawl_at: freshnessInfo.last_crawl_at,
|
||||||
|
is_stale: freshnessInfo.is_stale,
|
||||||
|
freshness: freshnessInfo.freshness,
|
||||||
|
hours_since_crawl: freshnessInfo.hours_since_crawl
|
||||||
|
}),
|
||||||
filters: {
|
filters: {
|
||||||
store_id,
|
store_id,
|
||||||
category_id,
|
category_id,
|
||||||
|
|||||||
@@ -28,28 +28,150 @@ router.get('/', async (req, res) => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Get single store
|
// Freshness threshold in hours
const STALE_THRESHOLD_HOURS = 4;

/**
 * Describe how long ago a store was last scraped, relative to the current time.
 *
 * @param lastScrapedAt Time of the last scrape, or null if the store was never scraped.
 * @returns `last_scraped_at` as an ISO-8601 string, `is_stale` (older than
 *          STALE_THRESHOLD_HOURS or never scraped), a human-readable `freshness` text
 *          and `hours_since_scrape` rounded to one decimal place. A null or invalid
 *          date is reported as "Never scraped"; a timestamp in the future (clock skew)
 *          is treated as "just now".
 */
function calculateFreshness(lastScrapedAt: Date | null): {
  last_scraped_at: string | null;
  is_stale: boolean;
  freshness: string;
  hours_since_scrape: number | null;
} {
  // An invalid Date would render as "NaN days ago" and make toISOString() throw.
  if (!lastScrapedAt || Number.isNaN(lastScrapedAt.getTime())) {
    return {
      last_scraped_at: null,
      is_stale: true,
      freshness: 'Never scraped',
      hours_since_scrape: null
    };
  }

  // Clamp at zero so a timestamp slightly ahead of this host's clock never yields a negative age.
  const elapsedMs = Math.max(0, Date.now() - lastScrapedAt.getTime());
  const diffHours = elapsedMs / (1000 * 60 * 60);
  const isStale = diffHours > STALE_THRESHOLD_HOURS;

  const describe = (count: number, unit: string): string =>
    `${count} ${unit}${count !== 1 ? 's' : ''} ago`;

  // Pick the unit from the *rounded* value so the text rolls over to the next unit
  // instead of reading "60 minutes ago" or "24 hours ago".
  const mins = Math.round(diffHours * 60);
  const hrs = Math.round(diffHours);

  let freshnessText: string;
  if (mins < 60) {
    freshnessText = describe(mins, 'minute');
  } else if (hrs < 24) {
    freshnessText = describe(hrs, 'hour');
  } else {
    freshnessText = describe(Math.round(diffHours / 24), 'day');
  }

  return {
    last_scraped_at: lastScrapedAt.toISOString(),
    is_stale: isStale,
    freshness: freshnessText,
    hours_since_scrape: Math.round(diffHours * 10) / 10
  };
}
|
||||||
|
|
||||||
|
/**
 * Guess the menu provider from a store's menu URL by looking for known domains.
 *
 * Matching is a case-insensitive substring test on the whole URL, checked in the
 * order listed below; the first hit wins.
 *
 * @param dutchieUrl The store's menu URL (may be null or empty).
 * @returns 'unknown' when no URL is given, the provider label when a known domain
 *          is found, otherwise 'Custom'.
 */
function detectProvider(dutchieUrl: string | null): string {
  if (!dutchieUrl) return 'unknown';

  // Host names are case-insensitive, so compare against a lower-cased copy.
  const url = dutchieUrl.toLowerCase();

  const knownProviders: ReadonlyArray<readonly [string, readonly string[]]> = [
    ['Dutchie', ['dutchie.com']],
    ['Jane', ['iheartjane.com', 'jane.co']],
    ['Treez', ['treez.io']],
    ['Weedmaps', ['weedmaps.com']],
    ['Leafly', ['leafly.com']]
  ];

  for (const [provider, domains] of knownProviders) {
    if (domains.some(domain => url.includes(domain))) {
      return provider;
    }
  }

  return 'Custom';
}
|
||||||
|
|
||||||
|
// Get single store with full details
|
||||||
router.get('/:id', async (req, res) => {
|
router.get('/:id', async (req, res) => {
|
||||||
try {
|
try {
|
||||||
const { id } = req.params;
|
const { id } = req.params;
|
||||||
|
|
||||||
|
// Get store with counts and linked dispensary
|
||||||
const result = await pool.query(`
|
const result = await pool.query(`
|
||||||
SELECT
|
SELECT
|
||||||
s.*,
|
s.*,
|
||||||
|
d.id as dispensary_id,
|
||||||
|
d.name as dispensary_name,
|
||||||
|
d.slug as dispensary_slug,
|
||||||
|
d.state as dispensary_state,
|
||||||
|
d.city as dispensary_city,
|
||||||
|
d.address as dispensary_address,
|
||||||
|
d.menu_provider as dispensary_menu_provider,
|
||||||
COUNT(DISTINCT p.id) as product_count,
|
COUNT(DISTINCT p.id) as product_count,
|
||||||
COUNT(DISTINCT c.id) as category_count
|
COUNT(DISTINCT c.id) as category_count,
|
||||||
|
COUNT(DISTINCT p.id) FILTER (WHERE p.in_stock = true) as in_stock_count,
|
||||||
|
COUNT(DISTINCT p.id) FILTER (WHERE p.in_stock = false) as out_of_stock_count
|
||||||
FROM stores s
|
FROM stores s
|
||||||
|
LEFT JOIN dispensaries d ON s.dispensary_id = d.id
|
||||||
LEFT JOIN products p ON s.id = p.store_id
|
LEFT JOIN products p ON s.id = p.store_id
|
||||||
LEFT JOIN categories c ON s.id = c.store_id
|
LEFT JOIN categories c ON s.id = c.store_id
|
||||||
WHERE s.id = $1
|
WHERE s.id = $1
|
||||||
GROUP BY s.id
|
GROUP BY s.id, d.id, d.name, d.slug, d.state, d.city, d.address, d.menu_provider
|
||||||
`, [id]);
|
`, [id]);
|
||||||
|
|
||||||
if (result.rows.length === 0) {
|
if (result.rows.length === 0) {
|
||||||
return res.status(404).json({ error: 'Store not found' });
|
return res.status(404).json({ error: 'Store not found' });
|
||||||
}
|
}
|
||||||
|
|
||||||
res.json(result.rows[0]);
|
const store = result.rows[0];
|
||||||
|
|
||||||
|
// Get recent crawl jobs for this store
|
||||||
|
const jobsResult = await pool.query(`
|
||||||
|
SELECT
|
||||||
|
id, status, job_type, trigger_type,
|
||||||
|
started_at, completed_at,
|
||||||
|
products_found, products_new, products_updated,
|
||||||
|
in_stock_count, out_of_stock_count,
|
||||||
|
error_message
|
||||||
|
FROM crawl_jobs
|
||||||
|
WHERE store_id = $1
|
||||||
|
ORDER BY created_at DESC
|
||||||
|
LIMIT 10
|
||||||
|
`, [id]);
|
||||||
|
|
||||||
|
// Get schedule info if exists
|
||||||
|
const scheduleResult = await pool.query(`
|
||||||
|
SELECT
|
||||||
|
enabled, interval_hours, next_run_at, last_run_at
|
||||||
|
FROM store_crawl_schedule
|
||||||
|
WHERE store_id = $1
|
||||||
|
`, [id]);
|
||||||
|
|
||||||
|
// Calculate freshness
|
||||||
|
const freshness = calculateFreshness(store.last_scraped_at);
|
||||||
|
|
||||||
|
// Detect provider from URL
|
||||||
|
const provider = detectProvider(store.dutchie_url);
|
||||||
|
|
||||||
|
// Build response
|
||||||
|
const response = {
|
||||||
|
...store,
|
||||||
|
provider,
|
||||||
|
freshness: freshness.freshness,
|
||||||
|
is_stale: freshness.is_stale,
|
||||||
|
hours_since_scrape: freshness.hours_since_scrape,
|
||||||
|
linked_dispensary: store.dispensary_id ? {
|
||||||
|
id: store.dispensary_id,
|
||||||
|
name: store.dispensary_name,
|
||||||
|
slug: store.dispensary_slug,
|
||||||
|
state: store.dispensary_state,
|
||||||
|
city: store.dispensary_city,
|
||||||
|
address: store.dispensary_address,
|
||||||
|
menu_provider: store.dispensary_menu_provider
|
||||||
|
} : null,
|
||||||
|
schedule: scheduleResult.rows[0] || null,
|
||||||
|
recent_jobs: jobsResult.rows
|
||||||
|
};
|
||||||
|
|
||||||
|
// Remove redundant dispensary fields from root
|
||||||
|
delete response.dispensary_name;
|
||||||
|
delete response.dispensary_slug;
|
||||||
|
delete response.dispensary_state;
|
||||||
|
delete response.dispensary_city;
|
||||||
|
delete response.dispensary_address;
|
||||||
|
delete response.dispensary_menu_provider;
|
||||||
|
|
||||||
|
res.json(response);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('Error fetching store:', error);
|
console.error('Error fetching store:', error);
|
||||||
res.status(500).json({ error: 'Failed to fetch store' });
|
res.status(500).json({ error: 'Failed to fetch store' });
|
||||||
|
|||||||
26
backend/src/scripts/crawl-five-sequential.ts
Normal file
26
backend/src/scripts/crawl-five-sequential.ts
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
import { runDispensaryOrchestrator } from '../services/dispensary-orchestrator';
|
||||||
|
|
||||||
|
// Run 5 crawlers sequentially to avoid OOM
const dispensaryIds = [112, 81, 115, 140, 177];

/** Turn an arbitrary thrown value into a printable message. */
function describeError(e: unknown): string {
  return e instanceof Error ? e.message : String(e);
}

/**
 * Run the dispensary orchestrator for every id in `dispensaryIds`, one at a time.
 *
 * Each crawl is awaited before the next one starts. A failure for one dispensary is
 * logged to stderr and does not stop the remaining crawls.
 */
async function run(): Promise<void> {
  console.log(`Starting ${dispensaryIds.length} crawlers SEQUENTIALLY...`);

  for (const id of dispensaryIds) {
    console.log(`\n=== Starting crawler for dispensary ${id} ===`);
    try {
      const result = await runDispensaryOrchestrator(id);
      console.log(` Status: ${result.status}`);
      console.log(` Summary: ${result.summary}`);
      // Product counts are only printed when the crawl reported a non-zero productsFound.
      if (result.productsFound) {
        console.log(` Products: ${result.productsFound} found, ${result.productsNew} new, ${result.productsUpdated} updated`);
      }
    } catch (e: unknown) {
      // Best effort: report the failure and move on to the next dispensary.
      console.error(` ERROR: ${describeError(e)}`);
    }
  }

  console.log(`\n=== All ${dispensaryIds.length} crawlers complete ===`);
}

run().catch((e: unknown) => {
  console.error('Fatal:', describeError(e));
  // Signal failure to the calling shell without cutting off pending output.
  process.exitCode = 1;
});
|
||||||
240
backend/src/services/availability.ts
Normal file
240
backend/src/services/availability.ts
Normal file
@@ -0,0 +1,240 @@
|
|||||||
|
/**
|
||||||
|
* Availability Service
|
||||||
|
*
|
||||||
|
* Normalizes product availability from various menu providers and tracks
|
||||||
|
* state transitions for inventory analytics.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Threshold for considering stock as "limited"
const LIMITED_THRESHOLD = 5;

export type AvailabilityStatus = 'in_stock' | 'out_of_stock' | 'limited' | 'unknown';

/** Provider-independent availability for a single product. */
export interface NormalizedAvailability {
  status: AvailabilityStatus;
  /** Stock count when one could be determined, otherwise null. */
  quantity: number | null;
  /** The raw indicators the status was derived from, kept for debugging. */
  raw: any;
}

/** Availability signals detected in page or product-card text. */
export interface AvailabilityHints {
  hasOutOfStockBadge?: boolean;
  hasLimitedBadge?: boolean;
  hasInStockBadge?: boolean;
  stockText?: string;
  quantityText?: string;
}

/** Map a status string to an availability status; 'unknown' when nothing matches. */
function statusFromText(text: string): AvailabilityStatus {
  const lower = text.toLowerCase();
  if (lower.includes('out') || lower.includes('unavailable')) return 'out_of_stock';
  if (lower.includes('limited') || lower.includes('low')) return 'limited';
  // Match "in stock" / "in_stock" / "in-stock" / "instock" rather than any "in"
  // substring, which would also hit "inactive", "pending" or "coming soon".
  if (/(?:^|[^a-z])in[\s_-]?stock/.test(lower) || lower.includes('available')) return 'in_stock';
  return 'unknown';
}

/** Classify a known stock count against LIMITED_THRESHOLD. */
function statusFromQuantity(quantity: number): AvailabilityStatus {
  if (quantity === 0) return 'out_of_stock';
  return quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock';
}

/**
 * Normalize availability from a Dutchie product
 *
 * Indicators read from the product object, in order of precedence for the status:
 * 1. isInStock / inStock boolean flags (a `false` on either wins);
 * 2. status string (matched by keyword);
 * 3. quantity, taken from potencyAmount.quantity when numeric, otherwise from the
 *    sum of numeric variants[].quantity.
 *
 * An unrecognised status string (e.g. "Active") falls through to the quantity rule.
 *
 * @param dutchieProduct Raw product object; null/undefined yields an 'unknown' result.
 * @returns The normalized status, the quantity (or null) and the raw indicators found.
 */
export function normalizeAvailability(dutchieProduct: any): NormalizedAvailability {
  const raw: any = {};

  if (dutchieProduct == null) {
    return { status: 'unknown', quantity: null, raw };
  }

  const variants: any[] = Array.isArray(dutchieProduct.variants) ? dutchieProduct.variants : [];

  // Collect raw availability data for debugging
  if (dutchieProduct.potencyAmount?.quantity !== undefined) {
    raw.potencyQuantity = dutchieProduct.potencyAmount.quantity;
  }
  if (dutchieProduct.status !== undefined) {
    raw.status = dutchieProduct.status;
  }
  if (dutchieProduct.isInStock !== undefined) {
    raw.isInStock = dutchieProduct.isInStock;
  }
  if (dutchieProduct.inStock !== undefined) {
    raw.inStock = dutchieProduct.inStock;
  }
  const variantQuantities = variants
    .filter((v: any) => v?.quantity !== undefined)
    .map((v: any) => ({ option: v.option, quantity: v.quantity }));
  if (variantQuantities.length) {
    raw.variantQuantities = variantQuantities;
  }

  // Try to extract quantity
  let quantity: number | null = null;
  if (typeof dutchieProduct.potencyAmount?.quantity === 'number') {
    quantity = dutchieProduct.potencyAmount.quantity;
  } else if (variants.length) {
    const numeric: number[] = variants
      .map((v: any) => v?.quantity)
      .filter((q: unknown): q is number => typeof q === 'number');
    const total = numeric.reduce((sum, q) => sum + q, 0);
    // A zero total is only trusted when every variant reported a number; otherwise
    // the unreported variants might still be in stock.
    if (total > 0 || numeric.length === variants.length) {
      quantity = total;
    }
  }

  // Determine status
  let status: AvailabilityStatus = 'unknown';

  // Explicit boolean flags take precedence
  if (dutchieProduct.isInStock === false || dutchieProduct.inStock === false) {
    status = 'out_of_stock';
  } else if (dutchieProduct.isInStock === true || dutchieProduct.inStock === true) {
    status = quantity !== null && quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock';
  } else {
    // Check status string
    if (typeof dutchieProduct.status === 'string') {
      status = statusFromText(dutchieProduct.status);
    }
    // Infer from quantity when the status string was absent or not recognised
    if (status === 'unknown' && quantity !== null) {
      status = statusFromQuantity(quantity);
    }
  }

  return { status, quantity, raw };
}
|
||||||
|
|
||||||
|
// Phrase tables are matched against lower-cased content.
const OUT_OF_STOCK_PHRASES: readonly string[] = [
  'out of stock',
  'out-of-stock',
  'sold out',
  'soldout',
  'unavailable',
  'not available',
  'coming soon',
  'notify me'
];

const LIMITED_STOCK_PHRASES: readonly string[] = [
  'limited stock',
  'limited quantity',
  'low stock',
  'few remaining',
  'almost gone',
  'selling fast'
];
const ONLY_N_LEFT_PATTERN = /only \d+ left/;

const IN_STOCK_PHRASES: readonly string[] = [
  'in stock',
  'in-stock',
  'add to cart',
  'add to bag',
  'buy now'
];
// "available" counts as an in-stock signal only as a word start that is not negated:
// it must not be the tail of "unavailable" nor directly follow "not".
const AVAILABLE_PATTERN = /(?<!\bnot\s+)(?<![a-z])available/;

/**
 * Extract availability hints from page content or product card HTML
 *
 * Used for sandbox provider scraping where we don't have structured data.
 * Matching is done on a lower-cased copy of the text, so `stockText` and
 * `quantityText` are returned lower-cased.
 *
 * @param pageContent Full page text/HTML; scanned only when `productElement` is empty or omitted.
 * @param productElement Text/HTML of a single product card; preferred when provided.
 * @returns The three badge flags (always set), plus `quantityText` / `stockText` when found.
 */
export function extractAvailabilityHints(pageContent: string, productElement?: string): AvailabilityHints {
  const hints: AvailabilityHints = {};
  const content = (productElement || pageContent).toLowerCase();

  // Check for out-of-stock indicators
  hints.hasOutOfStockBadge = OUT_OF_STOCK_PHRASES.some(p => content.includes(p));

  // Check for limited stock indicators
  hints.hasLimitedBadge =
    LIMITED_STOCK_PHRASES.some(p => content.includes(p)) || ONLY_N_LEFT_PATTERN.test(content);

  // Check for in-stock indicators
  hints.hasInStockBadge =
    IN_STOCK_PHRASES.some(p => content.includes(p)) || AVAILABLE_PATTERN.test(content);

  // Try to extract quantity text
  const qtyMatch = content.match(/(\d+)\s*(left|remaining|in stock|available)/i);
  if (qtyMatch) {
    hints.quantityText = qtyMatch[0];
  }

  // Look for explicit stock text (runs up to the next '<', i.e. the end of the text node)
  const stockTextMatch = content.match(/(out of stock|in stock|low stock|limited|sold out)[^<]*/i);
  if (stockTextMatch) {
    hints.stockText = stockTextMatch[0].trim();
  }

  return hints;
}
|
||||||
|
|
||||||
|
/**
 * Convert availability hints to normalized availability
 *
 * Precedence: an out-of-stock badge or a parsed quantity of zero gives
 * 'out_of_stock'; then a limited badge gives 'limited'; then an in-stock badge gives
 * 'limited' when a quantity at or below LIMITED_THRESHOLD was parsed, else 'in_stock'.
 * With no badge the status stays 'unknown'.
 *
 * @param hints Signals produced by extractAvailabilityHints.
 * @returns The status, the first integer found in `hints.quantityText` (or null),
 *          and the hints themselves as `raw`.
 */
export function hintsToAvailability(hints: AvailabilityHints): NormalizedAvailability {
  // Extract quantity if present
  let quantity: number | null = null;
  const digits = hints.quantityText?.match(/(\d+)/);
  if (digits) {
    quantity = parseInt(digits[1], 10);
  }

  // Determine status from hints
  let status: AvailabilityStatus = 'unknown';
  if (hints.hasOutOfStockBadge || quantity === 0) {
    // "0 in stock" / "0 available" also trips the in-stock badge, so an explicit
    // zero must be handled before the badge checks below.
    status = 'out_of_stock';
  } else if (hints.hasLimitedBadge) {
    status = 'limited';
  } else if (hints.hasInStockBadge) {
    status = quantity !== null && quantity <= LIMITED_THRESHOLD ? 'limited' : 'in_stock';
  }

  return {
    status,
    quantity,
    raw: hints
  };
}
|
||||||
|
|
||||||
|
/** Per-status product tallies plus the number of products whose status changed. */
export interface AvailabilityCounts {
  in_stock: number;
  out_of_stock: number;
  limited: number;
  unknown: number;
  changed: number;
}

/**
 * Tally a list of products by availability status.
 *
 * A product without an `availability_status` is counted as 'unknown'. `changed` is
 * incremented for every product that has a `previous_status` differing from its
 * current (defaulted) status.
 *
 * @param products Products carrying their current and, optionally, previous status.
 * @returns Freshly created counts; the input array is not modified.
 */
export function aggregateAvailability(
  products: Array<{ availability_status?: AvailabilityStatus; previous_status?: AvailabilityStatus }>
): AvailabilityCounts {
  const emptyTally: AvailabilityCounts = { in_stock: 0, out_of_stock: 0, limited: 0, unknown: 0, changed: 0 };

  return products.reduce((tally, { availability_status, previous_status }) => {
    const current = availability_status || 'unknown';
    tally[current] += 1;

    const hasTransition = Boolean(previous_status) && previous_status !== current;
    if (hasTransition) {
      tally.changed += 1;
    }
    return tally;
  }, emptyTally);
}
|
||||||
@@ -106,9 +106,10 @@ async function updateCategoryScanTime(
|
|||||||
}
|
}
|
||||||
|
|
||||||
async function getStoreIdForDispensary(dispensaryId: number): Promise<number | null> {
|
async function getStoreIdForDispensary(dispensaryId: number): Promise<number | null> {
|
||||||
|
// First check if dispensary has menu_url - if so, try to match with stores.dutchie_url
|
||||||
const result = await pool.query(
|
const result = await pool.query(
|
||||||
`SELECT s.id FROM stores s
|
`SELECT s.id FROM stores s
|
||||||
JOIN dispensaries d ON d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%'
|
JOIN dispensaries d ON d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%'
|
||||||
WHERE d.id = $1
|
WHERE d.id = $1
|
||||||
LIMIT 1`,
|
LIMIT 1`,
|
||||||
[dispensaryId]
|
[dispensaryId]
|
||||||
@@ -118,6 +119,7 @@ async function getStoreIdForDispensary(dispensaryId: number): Promise<number | n
|
|||||||
return result.rows[0].id;
|
return result.rows[0].id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Try matching by slug
|
||||||
const result2 = await pool.query(
|
const result2 = await pool.query(
|
||||||
`SELECT s.id FROM stores s
|
`SELECT s.id FROM stores s
|
||||||
JOIN dispensaries d ON d.website ILIKE '%' || s.slug || '%'
|
JOIN dispensaries d ON d.website ILIKE '%' || s.slug || '%'
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ import { scrapeStore } from '../scraper-v2';
|
|||||||
import puppeteer, { Browser, Page } from 'puppeteer';
|
import puppeteer, { Browser, Page } from 'puppeteer';
|
||||||
import { promises as fs } from 'fs';
|
import { promises as fs } from 'fs';
|
||||||
import path from 'path';
|
import path from 'path';
|
||||||
|
import { extractAvailabilityHints } from './availability';
|
||||||
|
|
||||||
const WORKER_ID = `crawler-${process.pid}-${Date.now()}`;
|
const WORKER_ID = `crawler-${process.pid}-${Date.now()}`;
|
||||||
|
|
||||||
@@ -500,9 +501,13 @@ export async function runSandboxCrawlJob(dispensaryId: number, sandboxId?: numbe
|
|||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Extract availability hints from page content
|
||||||
|
const availabilityHints = extractAvailabilityHints(html);
|
||||||
|
|
||||||
analysisData.page_structures.push({
|
analysisData.page_structures.push({
|
||||||
url,
|
url,
|
||||||
...structure,
|
...structure,
|
||||||
|
availabilityHints,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import { logger } from './logger';
|
|||||||
import { registerScraper, updateScraperStats, completeScraper } from '../routes/scraper-monitor';
|
import { registerScraper, updateScraperStats, completeScraper } from '../routes/scraper-monitor';
|
||||||
import { incrementProxyFailure, getActiveProxy, isBotDetectionError, putProxyInTimeout } from './proxy';
|
import { incrementProxyFailure, getActiveProxy, isBotDetectionError, putProxyInTimeout } from './proxy';
|
||||||
import { bypassAgeGate, detectStateFromUrl, setAgeGateCookies } from '../utils/age-gate';
|
import { bypassAgeGate, detectStateFromUrl, setAgeGateCookies } from '../utils/age-gate';
|
||||||
|
import { normalizeAvailability, AvailabilityStatus } from './availability';
|
||||||
|
|
||||||
// Apply stealth plugin for antidetect/anti-fingerprinting
|
// Apply stealth plugin for antidetect/anti-fingerprinting
|
||||||
puppeteer.use(StealthPlugin());
|
puppeteer.use(StealthPlugin());
|
||||||
@@ -35,6 +36,10 @@ interface Product {
|
|||||||
imageUrl?: string;
|
imageUrl?: string;
|
||||||
dutchieUrl: string;
|
dutchieUrl: string;
|
||||||
metadata?: any;
|
metadata?: any;
|
||||||
|
// Availability tracking
|
||||||
|
availabilityStatus?: AvailabilityStatus;
|
||||||
|
availabilityRaw?: any;
|
||||||
|
stockQuantity?: number | null;
|
||||||
}
|
}
|
||||||
|
|
||||||
export const USER_AGENTS = {
|
export const USER_AGENTS = {
|
||||||
@@ -584,6 +589,8 @@ export async function scrapeCategory(storeId: number, categoryId: number, userAg
|
|||||||
|
|
||||||
const formattedProducts: Product[] = products.map((p, index) => {
|
const formattedProducts: Product[] = products.map((p, index) => {
|
||||||
const sanitized = sanitizeProductData(p);
|
const sanitized = sanitizeProductData(p);
|
||||||
|
// Normalize availability from Dutchie product data
|
||||||
|
const availability = normalizeAvailability(p);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
dutchieProductId: `${category.store_slug}-${category.slug}-${Date.now()}-${index}`,
|
dutchieProductId: `${category.store_slug}-${category.slug}-${Date.now()}-${index}`,
|
||||||
@@ -599,7 +606,10 @@ export async function scrapeCategory(storeId: number, categoryId: number, userAg
|
|||||||
weight: sanitized.weight,
|
weight: sanitized.weight,
|
||||||
imageUrl: p.imageUrl,
|
imageUrl: p.imageUrl,
|
||||||
dutchieUrl: p.href,
|
dutchieUrl: p.href,
|
||||||
metadata: p.metadata || {}
|
metadata: p.metadata || {},
|
||||||
|
availabilityStatus: availability.status,
|
||||||
|
availabilityRaw: availability.raw,
|
||||||
|
stockQuantity: availability.quantity
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -666,16 +676,28 @@ export async function saveProducts(storeId: number, categoryId: number, products
|
|||||||
|
|
||||||
logger.info('scraper', `Saving ${products.length} products to database...`);
|
logger.info('scraper', `Saving ${products.length} products to database...`);
|
||||||
|
|
||||||
|
// Mark all products as out-of-stock before processing (they'll be re-marked if found)
|
||||||
|
// Also update availability_status and last_seen_out_of_stock_at for state transition tracking
|
||||||
await client.query(`
|
await client.query(`
|
||||||
UPDATE products
|
UPDATE products
|
||||||
SET in_stock = false
|
SET in_stock = false,
|
||||||
WHERE store_id = $1 AND category_id = $2
|
availability_status = 'out_of_stock',
|
||||||
|
last_seen_out_of_stock_at = CASE
|
||||||
|
WHEN availability_status != 'out_of_stock' THEN CURRENT_TIMESTAMP
|
||||||
|
ELSE last_seen_out_of_stock_at
|
||||||
|
END
|
||||||
|
WHERE store_id = $1 AND category_id = $2 AND in_stock = true
|
||||||
`, [storeId, categoryId]);
|
`, [storeId, categoryId]);
|
||||||
|
|
||||||
for (const product of products) {
|
for (const product of products) {
|
||||||
try {
|
try {
|
||||||
|
// Get availability from product (defaults to in_stock if product exists in scraped data)
|
||||||
|
const availStatus = product.availabilityStatus || 'in_stock';
|
||||||
|
const availRaw = product.availabilityRaw ? JSON.stringify(product.availabilityRaw) : null;
|
||||||
|
const stockQty = product.stockQuantity ?? null;
|
||||||
|
|
||||||
const existingResult = await client.query(`
|
const existingResult = await client.query(`
|
||||||
SELECT id, image_url, local_image_path
|
SELECT id, image_url, local_image_path, availability_status
|
||||||
FROM products
|
FROM products
|
||||||
WHERE store_id = $1 AND name = $2 AND category_id = $3
|
WHERE store_id = $1 AND name = $2 AND category_id = $3
|
||||||
AND (variant = $4 OR (variant IS NULL AND $4 IS NULL))
|
AND (variant = $4 OR (variant IS NULL AND $4 IS NULL))
|
||||||
@@ -687,6 +709,11 @@ export async function saveProducts(storeId: number, categoryId: number, products
|
|||||||
if (existingResult.rows.length > 0) {
|
if (existingResult.rows.length > 0) {
|
||||||
productId = existingResult.rows[0].id;
|
productId = existingResult.rows[0].id;
|
||||||
localImagePath = existingResult.rows[0].local_image_path;
|
localImagePath = existingResult.rows[0].local_image_path;
|
||||||
|
const prevStatus = existingResult.rows[0].availability_status;
|
||||||
|
|
||||||
|
// Determine if we need to update last_seen_in_stock_at
|
||||||
|
const isNowInStock = availStatus === 'in_stock' || availStatus === 'limited';
|
||||||
|
const wasOutOfStock = prevStatus === 'out_of_stock' || prevStatus === 'unknown';
|
||||||
|
|
||||||
await client.query(`
|
await client.query(`
|
||||||
UPDATE products
|
UPDATE products
|
||||||
@@ -694,13 +721,21 @@ export async function saveProducts(storeId: number, categoryId: number, products
|
|||||||
strain_type = $5, thc_percentage = $6, cbd_percentage = $7,
|
strain_type = $5, thc_percentage = $6, cbd_percentage = $7,
|
||||||
brand = $8, weight = $9, image_url = $10, dutchie_url = $11,
|
brand = $8, weight = $9, image_url = $10, dutchie_url = $11,
|
||||||
in_stock = true, metadata = $12, last_seen_at = CURRENT_TIMESTAMP,
|
in_stock = true, metadata = $12, last_seen_at = CURRENT_TIMESTAMP,
|
||||||
updated_at = CURRENT_TIMESTAMP
|
updated_at = CURRENT_TIMESTAMP,
|
||||||
|
availability_status = $14,
|
||||||
|
availability_raw = $15,
|
||||||
|
stock_quantity = $16,
|
||||||
|
last_seen_in_stock_at = CASE
|
||||||
|
WHEN $17 THEN CURRENT_TIMESTAMP
|
||||||
|
ELSE last_seen_in_stock_at
|
||||||
|
END
|
||||||
WHERE id = $13
|
WHERE id = $13
|
||||||
`, [
|
`, [
|
||||||
product.name, product.variant, product.description, product.price,
|
product.name, product.variant, product.description, product.price,
|
||||||
product.strainType, product.thcPercentage, product.cbdPercentage,
|
product.strainType, product.thcPercentage, product.cbdPercentage,
|
||||||
product.brand, product.weight, product.imageUrl, product.dutchieUrl,
|
product.brand, product.weight, product.imageUrl, product.dutchieUrl,
|
||||||
JSON.stringify(product.metadata), productId
|
JSON.stringify(product.metadata), productId, availStatus, availRaw, stockQty,
|
||||||
|
isNowInStock && wasOutOfStock
|
||||||
]);
|
]);
|
||||||
} else {
|
} else {
|
||||||
// Generate unique slug from product name + timestamp + random suffix
|
// Generate unique slug from product name + timestamp + random suffix
|
||||||
@@ -716,14 +751,15 @@ export async function saveProducts(storeId: number, categoryId: number, products
|
|||||||
INSERT INTO products (
|
INSERT INTO products (
|
||||||
store_id, category_id, dutchie_product_id, name, slug, variant, description,
|
store_id, category_id, dutchie_product_id, name, slug, variant, description,
|
||||||
price, strain_type, thc_percentage, cbd_percentage,
|
price, strain_type, thc_percentage, cbd_percentage,
|
||||||
brand, weight, image_url, dutchie_url, in_stock, metadata
|
brand, weight, image_url, dutchie_url, in_stock, metadata,
|
||||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16)
|
availability_status, availability_raw, stock_quantity, last_seen_in_stock_at
|
||||||
|
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16, $17, $18, $19, CURRENT_TIMESTAMP)
|
||||||
RETURNING id
|
RETURNING id
|
||||||
`, [
|
`, [
|
||||||
storeId, categoryId, product.dutchieProductId, product.name, slug, product.variant, product.description,
|
storeId, categoryId, product.dutchieProductId, product.name, slug, product.variant, product.description,
|
||||||
product.price, product.strainType, product.thcPercentage, product.cbdPercentage,
|
product.price, product.strainType, product.thcPercentage, product.cbdPercentage,
|
||||||
product.brand, product.weight, product.imageUrl, product.dutchieUrl,
|
product.brand, product.weight, product.imageUrl, product.dutchieUrl,
|
||||||
JSON.stringify(product.metadata)
|
JSON.stringify(product.metadata), availStatus, availRaw, stockQty
|
||||||
]);
|
]);
|
||||||
|
|
||||||
productId = insertResult.rows[0].id;
|
productId = insertResult.rows[0].id;
|
||||||
|
|||||||
1243
docs/CANNABRANDS_API_FRONTEND_SPEC.md
Normal file
1243
docs/CANNABRANDS_API_FRONTEND_SPEC.md
Normal file
File diff suppressed because it is too large
Load Diff
592
docs/CRAWL_OPERATIONS.md
Normal file
592
docs/CRAWL_OPERATIONS.md
Normal file
@@ -0,0 +1,592 @@
|
|||||||
|
# Crawl Operations & Data Philosophy
|
||||||
|
|
||||||
|
This document defines the operational constraints, scheduling requirements, and data integrity philosophy for the dispensary scraper system.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Frozen Crawler Policy
|
||||||
|
|
||||||
|
> **CRITICAL CONSTRAINT**: The crawler code is FROZEN. Do NOT modify any crawler logic.
|
||||||
|
|
||||||
|
### What Is Frozen
|
||||||
|
|
||||||
|
The following components are read-only and must not be modified:
|
||||||
|
|
||||||
|
- **Selectors**: All CSS/XPath selectors for extracting data from Dutchie pages
|
||||||
|
- **Parsing Logic**: Functions that transform raw HTML into structured data
|
||||||
|
- **Request Patterns**: URL construction, pagination, API calls to Dutchie
|
||||||
|
- **Browser Configuration**: Puppeteer settings, user agents, viewport sizes
|
||||||
|
- **Rate Limiting**: Request delays, retry logic, concurrent request limits
|
||||||
|
|
||||||
|
### What CAN Be Modified
|
||||||
|
|
||||||
|
You may build around the crawler's output:
|
||||||
|
|
||||||
|
| Layer | Allowed Changes |
|
||||||
|
|-------|-----------------|
|
||||||
|
| **Scheduling** | CronJobs, run frequency, store queuing |
|
||||||
|
| **Ingestion** | Post-processing of crawler output before DB insert |
|
||||||
|
| **API Layer** | Query logic, computed fields, response transformations |
|
||||||
|
| **Intelligence** | Aggregation tables, metrics computation |
|
||||||
|
| **Infrastructure** | K8s resources, scaling, monitoring |
|
||||||
|
|
||||||
|
### Rationale
|
||||||
|
|
||||||
|
The crawler has been stabilized through extensive testing. Changes to selectors or parsing risk:
|
||||||
|
- Breaking data extraction if Dutchie changes their UI
|
||||||
|
- Introducing regressions that are hard to detect
|
||||||
|
- Requiring re-validation across all store types
|
||||||
|
|
||||||
|
All improvements must happen in **downstream processing**, not in the crawler itself.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Crawl Scheduling
|
||||||
|
|
||||||
|
### Standard Schedule: Every 4 Hours
|
||||||
|
|
||||||
|
Run a full crawl for each store every 4 hours, 24/7.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# K8s CronJob: Every 4 hours
|
||||||
|
apiVersion: batch/v1
|
||||||
|
kind: CronJob
|
||||||
|
metadata:
|
||||||
|
name: scraper-4h-cycle
|
||||||
|
namespace: dispensary-scraper
|
||||||
|
spec:
|
||||||
|
schedule: "0 */4 * * *" # 00:00, 04:00, 08:00, 12:00, 16:00, 20:00 UTC
|
||||||
|
concurrencyPolicy: Forbid
|
||||||
|
jobTemplate:
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: scraper
|
||||||
|
image: code.cannabrands.app/creationshop/dispensary-scraper:latest
|
||||||
|
command: ["node", "dist/scripts/run-all-stores.js"]
|
||||||
|
env:
|
||||||
|
- name: DATABASE_URL
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: scraper-secrets
|
||||||
|
key: database-url
|
||||||
|
restartPolicy: OnFailure
|
||||||
|
```
|
||||||
|
|
||||||
|
### Daily Specials Crawl: 12:01 AM Store Local Time
|
||||||
|
|
||||||
|
Dispensaries often update their daily specials at midnight. We ensure a crawl happens at 12:01 AM in each store's local timezone.
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# K8s CronJob: Daily specials at store midnight (example for MST/Arizona)
|
||||||
|
apiVersion: batch/v1
|
||||||
|
kind: CronJob
|
||||||
|
metadata:
|
||||||
|
name: scraper-daily-specials-mst
|
||||||
|
namespace: dispensary-scraper
|
||||||
|
spec:
|
||||||
|
schedule: "1 7 * * *" # 12:01 AM MST = 07:01 UTC
|
||||||
|
concurrencyPolicy: Forbid
|
||||||
|
jobTemplate:
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: scraper
|
||||||
|
image: code.cannabrands.app/creationshop/dispensary-scraper:latest
|
||||||
|
command: ["node", "dist/scripts/run-stores-by-timezone.js", "America/Phoenix"]
|
||||||
|
restartPolicy: OnFailure
|
||||||
|
```
|
||||||
|
|
||||||
|
### Timezone-Aware Scheduling
|
||||||
|
|
||||||
|
The `stores` table includes a `timezone` column so crawls can be scheduled in each store's local time:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
ALTER TABLE stores ADD COLUMN IF NOT EXISTS timezone VARCHAR(50) DEFAULT 'America/Phoenix';
|
||||||
|
|
||||||
|
-- Lookup table for common dispensary timezones
|
||||||
|
-- America/Phoenix (Arizona, no DST)
|
||||||
|
-- America/Los_Angeles (California)
|
||||||
|
-- America/Denver (Colorado)
|
||||||
|
-- America/Chicago (Illinois)
|
||||||
|
```
|
||||||
|
|
||||||
|
### Scripts Required
|
||||||
|
|
||||||
|
```
|
||||||
|
/backend/src/scripts/
|
||||||
|
├── run-all-stores.ts # Run crawl for all enabled stores
|
||||||
|
├── run-stores-by-timezone.ts # Run crawl for stores in a specific timezone
|
||||||
|
└── scheduler.ts # Orchestrates CronJob dispatch
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Specials Detection Logic
|
||||||
|
|
||||||
|
> **Problem**: The Specials tab in the frontend is EMPTY even though products have discounts.
|
||||||
|
|
||||||
|
### Root Cause Analysis
|
||||||
|
|
||||||
|
Database investigation reveals:
|
||||||
|
|
||||||
|
| Metric | Count |
|
||||||
|
|--------|-------|
|
||||||
|
| Total products | 1,414 |
|
||||||
|
| `is_special = true` | 0 |
|
||||||
|
| Has "Special Offer" in name | 325 |
|
||||||
|
| Has `sale_price < regular_price` | 4 |
|
||||||
|
|
||||||
|
The crawler captures "Special Offer" **embedded in the product name** but doesn't set `is_special = true`.
|
||||||
|
|
||||||
|
### Solution: API-Layer Specials Detection
|
||||||
|
|
||||||
|
Since the crawler is frozen, detect specials at query time:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
-- Computed is_on_special in API queries
|
||||||
|
SELECT
|
||||||
|
p.*,
|
||||||
|
CASE
|
||||||
|
WHEN p.name ILIKE '%Special Offer%' THEN TRUE
|
||||||
|
WHEN p.sale_price IS NOT NULL
|
||||||
|
AND p.regular_price IS NOT NULL
|
||||||
|
AND p.sale_price::numeric < p.regular_price::numeric THEN TRUE
|
||||||
|
WHEN p.price IS NOT NULL
|
||||||
|
AND p.original_price IS NOT NULL
|
||||||
|
AND p.price::numeric < p.original_price::numeric THEN TRUE
|
||||||
|
ELSE FALSE
|
||||||
|
END AS is_on_special,
|
||||||
|
|
||||||
|
-- Compute special type
|
||||||
|
CASE
|
||||||
|
WHEN p.name ILIKE '%Special Offer%' THEN 'special_offer'
|
||||||
|
WHEN p.sale_price IS NOT NULL
|
||||||
|
AND p.regular_price IS NOT NULL
|
||||||
|
AND p.sale_price::numeric < p.regular_price::numeric THEN 'percent_off'
|
||||||
|
ELSE NULL
|
||||||
|
END AS computed_special_type,
|
||||||
|
|
||||||
|
-- Compute discount percentage
|
||||||
|
CASE
|
||||||
|
WHEN p.sale_price IS NOT NULL
|
||||||
|
AND p.regular_price IS NOT NULL
|
||||||
|
AND p.regular_price::numeric > 0
|
||||||
|
THEN ROUND((1 - p.sale_price::numeric / p.regular_price::numeric) * 100, 0)
|
||||||
|
ELSE NULL
|
||||||
|
END AS computed_discount_percent
|
||||||
|
|
||||||
|
FROM products p
|
||||||
|
WHERE p.store_id = :store_id;
|
||||||
|
```
|
||||||
|
|
||||||
|
### Special Detection Rules (Priority Order)
|
||||||
|
|
||||||
|
1. **Name Contains "Special Offer"**: `name ILIKE '%Special Offer%'`
|
||||||
|
- Type: `special_offer`
|
||||||
|
- Badge: "Special"
|
||||||
|
|
||||||
|
2. **Price Discount (sale < regular)**: `sale_price < regular_price`
|
||||||
|
- Type: `percent_off`
|
||||||
|
- Badge: Computed as "X% OFF"
|
||||||
|
|
||||||
|
3. **Price Discount (current < original)**: `price < original_price`
|
||||||
|
- Type: `percent_off`
|
||||||
|
- Badge: Computed as "X% OFF"
|
||||||
|
|
||||||
|
4. **Metadata Offers** (future): `metadata->'offers' IS NOT NULL`
|
||||||
|
- Parse offer type from metadata JSON
|
||||||
|
|
||||||
|
### Clean Product Name
|
||||||
|
|
||||||
|
Strip "Special Offer" from display name:
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
function cleanProductName(rawName: string): string {
|
||||||
|
return rawName
|
||||||
|
.replace(/Special Offer$/i, '')
|
||||||
|
.replace(/\s+$/, '') // Trim trailing whitespace
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### API Specials Endpoint
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// GET /api/stores/:store_key/specials
|
||||||
|
async function getStoreSpecials(storeKey: string, options: SpecialsOptions) {
|
||||||
|
const query = `
|
||||||
|
WITH specials AS (
|
||||||
|
SELECT
|
||||||
|
p.*,
|
||||||
|
-- Detect special
|
||||||
|
CASE
|
||||||
|
WHEN p.name ILIKE '%Special Offer%' THEN TRUE
|
||||||
|
WHEN p.sale_price::numeric < p.regular_price::numeric THEN TRUE
|
||||||
|
ELSE FALSE
|
||||||
|
END AS is_on_special,
|
||||||
|
|
||||||
|
-- Compute discount
|
||||||
|
CASE
|
||||||
|
WHEN p.sale_price IS NOT NULL AND p.regular_price IS NOT NULL
|
||||||
|
THEN ROUND((1 - p.sale_price::numeric / p.regular_price::numeric) * 100)
|
||||||
|
ELSE NULL
|
||||||
|
END AS discount_percent
|
||||||
|
|
||||||
|
FROM products p
|
||||||
|
JOIN stores s ON p.store_id = s.id
|
||||||
|
WHERE s.store_key = $1
|
||||||
|
AND p.in_stock = TRUE
|
||||||
|
)
|
||||||
|
SELECT * FROM specials
|
||||||
|
WHERE is_on_special = TRUE
|
||||||
|
ORDER BY discount_percent DESC NULLS LAST
|
||||||
|
LIMIT $2 OFFSET $3
|
||||||
|
`;
|
||||||
|
|
||||||
|
return db.query(query, [storeKey, options.limit, options.offset]);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Append-Only Data Philosophy
|
||||||
|
|
||||||
|
> **Principle**: Every crawl should ADD information, never LOSE it.
|
||||||
|
|
||||||
|
### What Append-Only Means
|
||||||
|
|
||||||
|
| Action | Allowed | Not Allowed |
|
||||||
|
|--------|---------|-------------|
|
||||||
|
| Insert new product | ✅ | - |
|
||||||
|
| Update product price | ✅ | - |
|
||||||
|
| Mark product out-of-stock | ✅ | - |
|
||||||
|
| DELETE product row | ❌ | Never delete |
|
||||||
|
| TRUNCATE table | ❌ | Never truncate |
|
||||||
|
| UPDATE to remove data | ❌ | Never null-out existing data |
|
||||||
|
|
||||||
|
### Product Lifecycle States
|
||||||
|
|
||||||
|
```sql
|
||||||
|
-- Products are never deleted, only state changes
|
||||||
|
ALTER TABLE products ADD COLUMN IF NOT EXISTS status VARCHAR(20) DEFAULT 'active';
|
||||||
|
|
||||||
|
-- Statuses:
|
||||||
|
-- 'active' - Currently in stock or recently seen
|
||||||
|
-- 'out_of_stock' - Seen but marked out of stock
|
||||||
|
-- 'stale' - Missing from the latest crawl and not seen for 3+ days (likely discontinued)
|
||||||
|
-- 'archived' - Manually marked as discontinued
|
||||||
|
|
||||||
|
CREATE INDEX idx_products_status ON products(status);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Marking Products Stale (NOT Deleting)
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
// After crawl completes, mark unseen products as stale
|
||||||
|
async function markStaleProducts(storeId: number, crawlRunId: number) {
|
||||||
|
await db.query(`
|
||||||
|
UPDATE products
|
||||||
|
SET
|
||||||
|
status = 'stale',
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE store_id = $1
|
||||||
|
AND id NOT IN (
|
||||||
|
SELECT DISTINCT product_id
|
||||||
|
FROM store_product_snapshots
|
||||||
|
WHERE crawl_run_id = $2
|
||||||
|
)
|
||||||
|
AND status = 'active'
|
||||||
|
AND last_seen_at < NOW() - INTERVAL '3 days'
|
||||||
|
`, [storeId, crawlRunId]);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Store Product Snapshots: True Append-Only
|
||||||
|
|
||||||
|
The `store_product_snapshots` table is strictly append-only:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE store_product_snapshots (
|
||||||
|
id SERIAL PRIMARY KEY,
|
||||||
|
store_id INTEGER NOT NULL REFERENCES stores(id),
|
||||||
|
product_id INTEGER NOT NULL REFERENCES products(id),
|
||||||
|
crawl_run_id INTEGER NOT NULL REFERENCES crawl_runs(id),
|
||||||
|
|
||||||
|
-- Snapshot of data at crawl time
|
||||||
|
price_cents INTEGER,
|
||||||
|
regular_price_cents INTEGER,
|
||||||
|
sale_price_cents INTEGER,
|
||||||
|
in_stock BOOLEAN NOT NULL,
|
||||||
|
|
||||||
|
-- Computed at crawl time
|
||||||
|
is_on_special BOOLEAN NOT NULL DEFAULT FALSE,
|
||||||
|
special_type VARCHAR(50),
|
||||||
|
discount_percent INTEGER,
|
||||||
|
|
||||||
|
captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
|
||||||
|
-- Composite unique: one snapshot per product per crawl
|
||||||
|
CONSTRAINT uq_snapshot_product_crawl UNIQUE (product_id, crawl_run_id)
|
||||||
|
);
|
||||||
|
|
||||||
|
-- Rows are never UPDATEd or DELETEd - this table is INSERT-only
|
||||||
|
-- For data corrections, log the change in data_corrections (see "Data Correction Pattern") instead of editing a snapshot
|
||||||
|
|
||||||
|
CREATE INDEX idx_snapshots_crawl ON store_product_snapshots(crawl_run_id);
|
||||||
|
CREATE INDEX idx_snapshots_product_time ON store_product_snapshots(product_id, captured_at DESC);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Crawl Runs Table
|
||||||
|
|
||||||
|
Track every crawl execution:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE crawl_runs (
|
||||||
|
id SERIAL PRIMARY KEY,
|
||||||
|
store_id INTEGER NOT NULL REFERENCES stores(id),
|
||||||
|
started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
|
||||||
|
completed_at TIMESTAMPTZ,
|
||||||
|
status VARCHAR(20) NOT NULL DEFAULT 'running',
|
||||||
|
products_found INTEGER,
|
||||||
|
products_new INTEGER,
|
||||||
|
products_updated INTEGER,
|
||||||
|
error_message TEXT,
|
||||||
|
|
||||||
|
-- Scheduling metadata
|
||||||
|
trigger_type VARCHAR(20) NOT NULL DEFAULT 'scheduled', -- 'scheduled', 'manual', 'daily_specials'
|
||||||
|
|
||||||
|
CONSTRAINT chk_crawl_status CHECK (status IN ('running', 'completed', 'failed'))
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX idx_crawl_runs_store_time ON crawl_runs(store_id, started_at DESC);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Data Correction Pattern
|
||||||
|
|
||||||
|
If data needs correction, don't UPDATE - insert a correction record:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE data_corrections (
|
||||||
|
id SERIAL PRIMARY KEY,
|
||||||
|
table_name VARCHAR(50) NOT NULL,
|
||||||
|
record_id INTEGER NOT NULL,
|
||||||
|
field_name VARCHAR(100) NOT NULL,
|
||||||
|
old_value JSONB,
|
||||||
|
new_value JSONB,
|
||||||
|
reason TEXT NOT NULL,
|
||||||
|
corrected_by VARCHAR(100) NOT NULL,
|
||||||
|
corrected_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Safe Ingestion Patterns
|
||||||
|
|
||||||
|
### Upsert Products (Preserving History)
|
||||||
|
|
||||||
|
```typescript
|
||||||
|
async function upsertProduct(storeId: number, crawlRunId: number, product: ScrapedProduct) {
|
||||||
|
// 1. Find or create product
|
||||||
|
const existing = await db.query(
|
||||||
|
`SELECT id, price, regular_price, sale_price FROM products
|
||||||
|
WHERE store_id = $1 AND dutchie_product_id = $2`,
|
||||||
|
[storeId, product.dutchieId]
|
||||||
|
);
|
||||||
|
|
||||||
|
let productId: number;
|
||||||
|
|
||||||
|
if (existing.rows.length === 0) {
|
||||||
|
// INSERT new product
|
||||||
|
const result = await db.query(`
|
||||||
|
INSERT INTO products (
|
||||||
|
store_id, dutchie_product_id, name, slug, price, regular_price, sale_price,
|
||||||
|
in_stock, first_seen_at, last_seen_at, status
|
||||||
|
) VALUES ($1, $2, $3, $4, $5, $6, $7, TRUE, NOW(), NOW(), 'active')
|
||||||
|
RETURNING id
|
||||||
|
`, [storeId, product.dutchieId, product.name, product.slug,
|
||||||
|
product.price, product.regularPrice, product.salePrice]);
|
||||||
|
productId = result.rows[0].id;
|
||||||
|
} else {
|
||||||
|
// UPDATE existing - COALESCE keeps the stored value when the crawl returns null, so data is never nulled out
|
||||||
|
productId = existing.rows[0].id;
|
||||||
|
await db.query(`
|
||||||
|
UPDATE products SET
|
||||||
|
name = COALESCE($2, name),
|
||||||
|
price = COALESCE($3, price),
|
||||||
|
regular_price = COALESCE($4, regular_price),
|
||||||
|
sale_price = COALESCE($5, sale_price),
|
||||||
|
in_stock = TRUE,
|
||||||
|
last_seen_at = NOW(),
|
||||||
|
status = 'active',
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE id = $1
|
||||||
|
`, [productId, product.name, product.price, product.regularPrice, product.salePrice]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 2. Always create snapshot (append-only)
|
||||||
|
const isOnSpecial = detectSpecial(product);
|
||||||
|
const discountPercent = computeDiscount(product);
|
||||||
|
|
||||||
|
await db.query(`
|
||||||
|
INSERT INTO store_product_snapshots (
|
||||||
|
store_id, product_id, crawl_run_id,
|
||||||
|
price_cents, regular_price_cents, sale_price_cents,
|
||||||
|
in_stock, is_on_special, special_type, discount_percent
|
||||||
|
) VALUES ($1, $2, $3, $4, $5, $6, TRUE, $7, $8, $9)
|
||||||
|
ON CONFLICT (product_id, crawl_run_id) DO NOTHING
|
||||||
|
`, [
|
||||||
|
storeId, productId, crawlRunId,
|
||||||
|
toCents(product.price), toCents(product.regularPrice), toCents(product.salePrice),
|
||||||
|
isOnSpecial, isOnSpecial ? 'percent_off' : null, discountPercent
|
||||||
|
]);
|
||||||
|
|
||||||
|
return productId;
|
||||||
|
}
|
||||||
|
|
||||||
|
function detectSpecial(product: ScrapedProduct): boolean {
|
||||||
|
// Check name for "Special Offer"
|
||||||
|
if (product.name?.includes('Special Offer')) return true;
|
||||||
|
|
||||||
|
// Check price discount
|
||||||
|
if (product.salePrice && product.regularPrice) {
|
||||||
|
return parseFloat(product.salePrice) < parseFloat(product.regularPrice);
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
function computeDiscount(product: ScrapedProduct): number | null {
|
||||||
|
if (!product.salePrice || !product.regularPrice) return null;
|
||||||
|
|
||||||
|
const sale = parseFloat(product.salePrice);
|
||||||
|
const regular = parseFloat(product.regularPrice);
|
||||||
|
|
||||||
|
if (regular <= 0) return null;
|
||||||
|
|
||||||
|
return Math.round((1 - sale / regular) * 100);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. K8s Deployment Configuration
|
||||||
|
|
||||||
|
### CronJobs Overview
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# All CronJobs for scheduling
|
||||||
|
apiVersion: v1
|
||||||
|
kind: List
|
||||||
|
items:
|
||||||
|
# 1. Standard 4-hour crawl cycle
|
||||||
|
- apiVersion: batch/v1
|
||||||
|
kind: CronJob
|
||||||
|
metadata:
|
||||||
|
name: scraper-4h-00
|
||||||
|
namespace: dispensary-scraper
|
||||||
|
spec:
|
||||||
|
schedule: "0 0,4,8,12,16,20 * * *"
|
||||||
|
concurrencyPolicy: Forbid
|
||||||
|
successfulJobsHistoryLimit: 3
|
||||||
|
failedJobsHistoryLimit: 3
|
||||||
|
jobTemplate:
|
||||||
|
spec:
|
||||||
|
activeDeadlineSeconds: 3600 # 1 hour timeout
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: scraper
|
||||||
|
image: code.cannabrands.app/creationshop/dispensary-scraper:latest
|
||||||
|
command: ["node", "dist/scripts/run-all-stores.js"]
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: "512Mi"
|
||||||
|
cpu: "250m"
|
||||||
|
limits:
|
||||||
|
memory: "2Gi"
|
||||||
|
cpu: "1000m"
|
||||||
|
restartPolicy: OnFailure
|
||||||
|
|
||||||
|
# 2. Daily specials crawl - Arizona (MST, no DST)
|
||||||
|
- apiVersion: batch/v1
|
||||||
|
kind: CronJob
|
||||||
|
metadata:
|
||||||
|
name: scraper-daily-mst
|
||||||
|
namespace: dispensary-scraper
|
||||||
|
spec:
|
||||||
|
schedule: "1 7 * * *" # 12:01 AM MST = 07:01 UTC
|
||||||
|
concurrencyPolicy: Forbid
|
||||||
|
jobTemplate:
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: scraper
|
||||||
|
command: ["node", "dist/scripts/run-stores-by-timezone.js", "America/Phoenix"]
|
||||||
|
|
||||||
|
# 3. Daily specials crawl - California (PST/PDT)
|
||||||
|
- apiVersion: batch/v1
|
||||||
|
kind: CronJob
|
||||||
|
metadata:
|
||||||
|
name: scraper-daily-pst
|
||||||
|
namespace: dispensary-scraper
|
||||||
|
spec:
|
||||||
|
schedule: "1 8 * * *" # 12:01 AM PST = 08:01 UTC; during PDT (DST) this must be "1 7 * * *" (07:01 UTC)
|
||||||
|
concurrencyPolicy: Forbid
|
||||||
|
jobTemplate:
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: scraper
|
||||||
|
command: ["node", "dist/scripts/run-stores-by-timezone.js", "America/Los_Angeles"]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Monitoring and Alerts
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# PrometheusRule for scraper monitoring
|
||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: PrometheusRule
|
||||||
|
metadata:
|
||||||
|
name: scraper-alerts
|
||||||
|
namespace: dispensary-scraper
|
||||||
|
spec:
|
||||||
|
groups:
|
||||||
|
- name: scraper.rules
|
||||||
|
rules:
|
||||||
|
- alert: ScraperJobFailed
|
||||||
|
expr: kube_job_status_failed{namespace="dispensary-scraper"} > 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: "Scraper job failed"
|
||||||
|
|
||||||
|
- alert: ScraperMissedSchedule
|
||||||
|
expr: time() - kube_cronjob_status_last_successful_time{namespace="dispensary-scraper"} > 18000
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: "Scraper hasn't run successfully in 5+ hours"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Summary
|
||||||
|
|
||||||
|
| Constraint | Implementation |
|
||||||
|
|------------|----------------|
|
||||||
|
| **Frozen Crawler** | No changes to selectors, parsing, or request logic |
|
||||||
|
| **4-Hour Schedule** | K8s CronJob at 0,4,8,12,16,20 UTC |
|
||||||
|
| **12:01 AM Specials** | Timezone-specific CronJobs for store local midnight |
|
||||||
|
| **Specials Detection** | API-layer detection via name pattern + price comparison |
|
||||||
|
| **Append-Only Data** | Never DELETE; use status flags; `store_product_snapshots` is INSERT-only |
|
||||||
|
| **Historical Preservation** | All crawls create snapshots; stale products marked, never deleted |
|
||||||
|
|
||||||
|
This design ensures we maximize the value of crawler data without risking breakage from crawler modifications.
|
||||||
2004
docs/PRODUCT_BRAND_INTELLIGENCE_ARCHITECTURE.md
Normal file
2004
docs/PRODUCT_BRAND_INTELLIGENCE_ARCHITECTURE.md
Normal file
File diff suppressed because it is too large
Load Diff
2027
docs/PRODUCT_BRAND_INTELLIGENCE_FINAL.md
Normal file
2027
docs/PRODUCT_BRAND_INTELLIGENCE_FINAL.md
Normal file
File diff suppressed because it is too large
Load Diff
1958
docs/STORE_API_SPECIFICATION.md
Normal file
1958
docs/STORE_API_SPECIFICATION.md
Normal file
File diff suppressed because it is too large
Load Diff
2994
docs/WORDPRESS_PLUGIN_SPEC.md
Normal file
2994
docs/WORDPRESS_PLUGIN_SPEC.md
Normal file
File diff suppressed because it is too large
Load Diff
@@ -2,7 +2,10 @@ import { useEffect, useState } from 'react';
|
|||||||
import { useParams, useNavigate } from 'react-router-dom';
|
import { useParams, useNavigate } from 'react-router-dom';
|
||||||
import { Layout } from '../components/Layout';
|
import { Layout } from '../components/Layout';
|
||||||
import { api } from '../lib/api';
|
import { api } from '../lib/api';
|
||||||
import { Package, Tag, Zap, TrendingUp, Calendar, DollarSign } from 'lucide-react';
|
import {
|
||||||
|
Package, Tag, Zap, Clock, ExternalLink, CheckCircle, XCircle,
|
||||||
|
AlertCircle, Building, MapPin, RefreshCw, Calendar, Activity
|
||||||
|
} from 'lucide-react';
|
||||||
|
|
||||||
export function StoreDetail() {
|
export function StoreDetail() {
|
||||||
const { slug } = useParams();
|
const { slug } = useParams();
|
||||||
@@ -14,7 +17,7 @@ export function StoreDetail() {
|
|||||||
const [loading, setLoading] = useState(true);
|
const [loading, setLoading] = useState(true);
|
||||||
const [selectedCategory, setSelectedCategory] = useState<number | null>(null);
|
const [selectedCategory, setSelectedCategory] = useState<number | null>(null);
|
||||||
const [selectedBrand, setSelectedBrand] = useState<string>('');
|
const [selectedBrand, setSelectedBrand] = useState<string>('');
|
||||||
const [view, setView] = useState<'products' | 'brands' | 'specials'>('products');
|
const [view, setView] = useState<'products' | 'brands' | 'specials' | 'crawl-history'>('products');
|
||||||
const [sortBy, setSortBy] = useState('name');
|
const [sortBy, setSortBy] = useState('name');
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
@@ -30,19 +33,22 @@ export function StoreDetail() {
|
|||||||
const loadStoreData = async () => {
|
const loadStoreData = async () => {
|
||||||
setLoading(true);
|
setLoading(true);
|
||||||
try {
|
try {
|
||||||
|
// First, find store by slug to get its ID
|
||||||
const allStores = await api.getStores();
|
const allStores = await api.getStores();
|
||||||
const storeData = allStores.stores.find((s: any) => s.slug === slug);
|
const basicStore = allStores.stores.find((s: any) => s.slug === slug);
|
||||||
|
|
||||||
if (!storeData) {
|
if (!basicStore) {
|
||||||
throw new Error('Store not found');
|
throw new Error('Store not found');
|
||||||
}
|
}
|
||||||
|
|
||||||
const [categoriesData, brandsData] = await Promise.all([
|
// Fetch full store details using the enhanced endpoint
|
||||||
api.getCategories(storeData.id),
|
const [fullStoreData, categoriesData, brandsData] = await Promise.all([
|
||||||
api.getStoreBrands(storeData.id)
|
api.getStore(basicStore.id),
|
||||||
|
api.getCategories(basicStore.id),
|
||||||
|
api.getStoreBrands(basicStore.id)
|
||||||
]);
|
]);
|
||||||
|
|
||||||
setStore(storeData);
|
setStore(fullStoreData);
|
||||||
setCategories(categoriesData.categories || []);
|
setCategories(categoriesData.categories || []);
|
||||||
setBrands(brandsData.brands || []);
|
setBrands(brandsData.brands || []);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@@ -101,6 +107,43 @@ export function StoreDetail() {
|
|||||||
return 'https://via.placeholder.com/300x300?text=No+Image';
|
return 'https://via.placeholder.com/300x300?text=No+Image';
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const formatDate = (dateString: string | null) => {
|
||||||
|
if (!dateString) return 'Never';
|
||||||
|
return new Date(dateString).toLocaleString('en-US', {
|
||||||
|
month: 'short',
|
||||||
|
day: 'numeric',
|
||||||
|
year: 'numeric',
|
||||||
|
hour: '2-digit',
|
||||||
|
minute: '2-digit'
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
const getProviderBadgeColor = (provider: string) => {
|
||||||
|
switch (provider?.toLowerCase()) {
|
||||||
|
case 'dutchie': return 'bg-green-100 text-green-700';
|
||||||
|
case 'jane': return 'bg-purple-100 text-purple-700';
|
||||||
|
case 'treez': return 'bg-blue-100 text-blue-700';
|
||||||
|
case 'weedmaps': return 'bg-orange-100 text-orange-700';
|
||||||
|
case 'leafly': return 'bg-emerald-100 text-emerald-700';
|
||||||
|
default: return 'bg-gray-100 text-gray-700';
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Renders the status pill shown in the crawl-history table.
 *
 * The four recognised statuses ('completed', 'running', 'failed',
 * 'pending') each get a coloured pill with an icon and a label; any other
 * value is shown verbatim in a neutral gray pill without an icon.
 *
 * @param status - Job status string, compared exactly (case-sensitive).
 * @returns The badge element for the given status.
 */
const getJobStatusBadge = (status: string) => {
  if (status === 'completed') {
    return (
      <span className="px-2 py-1 text-xs font-medium bg-green-100 text-green-700 rounded-full flex items-center gap-1">
        <CheckCircle className="w-3 h-3" /> Completed
      </span>
    );
  }

  if (status === 'running') {
    return (
      <span className="px-2 py-1 text-xs font-medium bg-blue-100 text-blue-700 rounded-full flex items-center gap-1">
        <RefreshCw className="w-3 h-3 animate-spin" /> Running
      </span>
    );
  }

  if (status === 'failed') {
    return (
      <span className="px-2 py-1 text-xs font-medium bg-red-100 text-red-700 rounded-full flex items-center gap-1">
        <XCircle className="w-3 h-3" /> Failed
      </span>
    );
  }

  if (status === 'pending') {
    return (
      <span className="px-2 py-1 text-xs font-medium bg-yellow-100 text-yellow-700 rounded-full flex items-center gap-1">
        <Clock className="w-3 h-3" /> Pending
      </span>
    );
  }

  // Unrecognised status: echo the raw value so it is still visible.
  return (
    <span className="px-2 py-1 text-xs font-medium bg-gray-100 text-gray-700 rounded-full">{status}</span>
  );
};
|
||||||
|
|
||||||
if (loading) {
|
if (loading) {
|
||||||
return (
|
return (
|
||||||
<Layout>
|
<Layout>
|
||||||
@@ -127,33 +170,112 @@ export function StoreDetail() {
|
|||||||
return (
|
return (
|
||||||
<Layout>
|
<Layout>
|
||||||
<div className="space-y-6">
|
<div className="space-y-6">
|
||||||
{/* Header */}
|
{/* Header with Store Info */}
|
||||||
<div className="bg-white rounded-xl border border-gray-200 p-6">
|
<div className="bg-white rounded-xl border border-gray-200 p-6">
|
||||||
<div className="flex items-center justify-between mb-4">
|
<div className="flex items-start justify-between mb-6">
|
||||||
<div className="flex items-center gap-4">
|
<div className="flex items-start gap-4">
|
||||||
<button
|
<button
|
||||||
onClick={() => navigate('/stores')}
|
onClick={() => navigate('/stores')}
|
||||||
className="text-gray-600 hover:text-gray-900"
|
className="text-gray-600 hover:text-gray-900 mt-1"
|
||||||
>
|
>
|
||||||
← Back
|
← Back
|
||||||
</button>
|
</button>
|
||||||
<div>
|
<div>
|
||||||
<h1 className="text-2xl font-semibold text-gray-900">{store.name}</h1>
|
<div className="flex items-center gap-3">
|
||||||
<p className="text-sm text-gray-500 mt-1">
|
<h1 className="text-2xl font-semibold text-gray-900">{store.name}</h1>
|
||||||
{products.length} products • {categories.length} categories • {brands.length} brands
|
<span className={`px-2 py-1 text-xs font-medium rounded ${getProviderBadgeColor(store.provider)}`}>
|
||||||
</p>
|
{store.provider || 'Unknown'}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<p className="text-sm text-gray-500 mt-1">Store ID: {store.id}</p>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<a
|
<a
|
||||||
href={store.dutchie_url}
|
href={store.dutchie_url}
|
||||||
target="_blank"
|
target="_blank"
|
||||||
rel="noopener noreferrer"
|
rel="noopener noreferrer"
|
||||||
className="text-sm text-blue-600 hover:text-blue-700"
|
className="flex items-center gap-1 text-sm text-blue-600 hover:text-blue-700"
|
||||||
>
|
>
|
||||||
View on Dutchie →
|
View Menu <ExternalLink className="w-4 h-4" />
|
||||||
</a>
|
</a>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{/* Stats Row */}
|
||||||
|
<div className="grid grid-cols-2 md:grid-cols-4 lg:grid-cols-6 gap-4 mb-6">
|
||||||
|
<div className="p-4 bg-gray-50 rounded-lg">
|
||||||
|
<div className="flex items-center gap-2 text-gray-500 text-xs mb-1">
|
||||||
|
<Package className="w-4 h-4" />
|
||||||
|
Products
|
||||||
|
</div>
|
||||||
|
<p className="text-xl font-semibold text-gray-900">{store.product_count || 0}</p>
|
||||||
|
</div>
|
||||||
|
<div className="p-4 bg-gray-50 rounded-lg">
|
||||||
|
<div className="flex items-center gap-2 text-gray-500 text-xs mb-1">
|
||||||
|
<Tag className="w-4 h-4" />
|
||||||
|
Categories
|
||||||
|
</div>
|
||||||
|
<p className="text-xl font-semibold text-gray-900">{store.category_count || 0}</p>
|
||||||
|
</div>
|
||||||
|
<div className="p-4 bg-green-50 rounded-lg">
|
||||||
|
<div className="flex items-center gap-2 text-green-600 text-xs mb-1">
|
||||||
|
<CheckCircle className="w-4 h-4" />
|
||||||
|
In Stock
|
||||||
|
</div>
|
||||||
|
<p className="text-xl font-semibold text-green-700">{store.in_stock_count || 0}</p>
|
||||||
|
</div>
|
||||||
|
<div className="p-4 bg-red-50 rounded-lg">
|
||||||
|
<div className="flex items-center gap-2 text-red-600 text-xs mb-1">
|
||||||
|
<XCircle className="w-4 h-4" />
|
||||||
|
Out of Stock
|
||||||
|
</div>
|
||||||
|
<p className="text-xl font-semibold text-red-700">{store.out_of_stock_count || 0}</p>
|
||||||
|
</div>
|
||||||
|
<div className={`p-4 rounded-lg ${store.is_stale ? 'bg-yellow-50' : 'bg-blue-50'}`}>
|
||||||
|
<div className={`flex items-center gap-2 text-xs mb-1 ${store.is_stale ? 'text-yellow-600' : 'text-blue-600'}`}>
|
||||||
|
<Clock className="w-4 h-4" />
|
||||||
|
Freshness
|
||||||
|
</div>
|
||||||
|
<p className={`text-sm font-semibold ${store.is_stale ? 'text-yellow-700' : 'text-blue-700'}`}>
|
||||||
|
{store.freshness || 'Never scraped'}
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<div className="p-4 bg-gray-50 rounded-lg">
|
||||||
|
<div className="flex items-center gap-2 text-gray-500 text-xs mb-1">
|
||||||
|
<Calendar className="w-4 h-4" />
|
||||||
|
Next Crawl
|
||||||
|
</div>
|
||||||
|
<p className="text-sm font-semibold text-gray-700">
|
||||||
|
{store.schedule?.next_run_at ? formatDate(store.schedule.next_run_at) : 'Not scheduled'}
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
{/* Linked Dispensary */}
|
||||||
|
{store.linked_dispensary && (
|
||||||
|
<div className="p-4 bg-indigo-50 rounded-lg mb-6">
|
||||||
|
<div className="flex items-center gap-2 text-indigo-600 text-xs mb-2">
|
||||||
|
<Building className="w-4 h-4" />
|
||||||
|
Linked Dispensary
|
||||||
|
</div>
|
||||||
|
<div className="flex items-center justify-between">
|
||||||
|
<div>
|
||||||
|
<p className="font-semibold text-indigo-900">{store.linked_dispensary.name}</p>
|
||||||
|
<p className="text-sm text-indigo-700 flex items-center gap-1">
|
||||||
|
<MapPin className="w-3 h-3" />
|
||||||
|
{store.linked_dispensary.city}, {store.linked_dispensary.state}
|
||||||
|
{store.linked_dispensary.address && ` - ${store.linked_dispensary.address}`}
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<button
|
||||||
|
onClick={() => navigate(`/dispensaries/${store.linked_dispensary.slug}`)}
|
||||||
|
className="text-sm text-indigo-600 hover:text-indigo-700 font-medium"
|
||||||
|
>
|
||||||
|
View Dispensary →
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
{/* View Tabs */}
|
{/* View Tabs */}
|
||||||
<div className="flex gap-2 border-b border-gray-200">
|
<div className="flex gap-2 border-b border-gray-200">
|
||||||
<button
|
<button
|
||||||
@@ -195,9 +317,75 @@ export function StoreDetail() {
|
|||||||
Specials
|
Specials
|
||||||
</div>
|
</div>
|
||||||
</button>
|
</button>
|
||||||
|
<button
|
||||||
|
onClick={() => setView('crawl-history')}
|
||||||
|
className={`px-4 py-2 border-b-2 transition-colors ${
|
||||||
|
view === 'crawl-history'
|
||||||
|
? 'border-blue-600 text-blue-600 font-medium'
|
||||||
|
: 'border-transparent text-gray-600 hover:text-gray-900'
|
||||||
|
}`}
|
||||||
|
>
|
||||||
|
<div className="flex items-center gap-2">
|
||||||
|
<Activity className="w-4 h-4" />
|
||||||
|
Crawl History ({store.recent_jobs?.length || 0})
|
||||||
|
</div>
|
||||||
|
</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{/* Crawl History View */}
|
||||||
|
{view === 'crawl-history' && (
|
||||||
|
<div className="bg-white rounded-xl border border-gray-200 overflow-hidden">
|
||||||
|
<div className="p-4 border-b border-gray-200">
|
||||||
|
<h2 className="text-lg font-semibold text-gray-900">Recent Crawl Jobs</h2>
|
||||||
|
<p className="text-sm text-gray-500">Last 10 crawl jobs for this store</p>
|
||||||
|
</div>
|
||||||
|
{store.recent_jobs && store.recent_jobs.length > 0 ? (
|
||||||
|
<div className="overflow-x-auto">
|
||||||
|
<table className="w-full">
|
||||||
|
<thead className="bg-gray-50">
|
||||||
|
<tr>
|
||||||
|
<th className="px-4 py-3 text-left text-xs font-medium text-gray-500 uppercase">Status</th>
|
||||||
|
<th className="px-4 py-3 text-left text-xs font-medium text-gray-500 uppercase">Type</th>
|
||||||
|
<th className="px-4 py-3 text-left text-xs font-medium text-gray-500 uppercase">Started</th>
|
||||||
|
<th className="px-4 py-3 text-left text-xs font-medium text-gray-500 uppercase">Completed</th>
|
||||||
|
<th className="px-4 py-3 text-center text-xs font-medium text-gray-500 uppercase">Found</th>
|
||||||
|
<th className="px-4 py-3 text-center text-xs font-medium text-gray-500 uppercase">New</th>
|
||||||
|
<th className="px-4 py-3 text-center text-xs font-medium text-gray-500 uppercase">Updated</th>
|
||||||
|
<th className="px-4 py-3 text-center text-xs font-medium text-gray-500 uppercase">In Stock</th>
|
||||||
|
<th className="px-4 py-3 text-center text-xs font-medium text-gray-500 uppercase">Out of Stock</th>
|
||||||
|
<th className="px-4 py-3 text-left text-xs font-medium text-gray-500 uppercase">Error</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody className="divide-y divide-gray-100">
|
||||||
|
{store.recent_jobs.map((job: any) => (
|
||||||
|
<tr key={job.id} className="hover:bg-gray-50">
|
||||||
|
<td className="px-4 py-3">{getJobStatusBadge(job.status)}</td>
|
||||||
|
<td className="px-4 py-3 text-sm text-gray-700">{job.job_type || '-'}</td>
|
||||||
|
<td className="px-4 py-3 text-sm text-gray-700">{formatDate(job.started_at)}</td>
|
||||||
|
<td className="px-4 py-3 text-sm text-gray-700">{formatDate(job.completed_at)}</td>
|
||||||
|
<td className="px-4 py-3 text-center text-sm font-medium text-gray-900">{job.products_found ?? '-'}</td>
|
||||||
|
<td className="px-4 py-3 text-center text-sm font-medium text-green-600">{job.products_new ?? '-'}</td>
|
||||||
|
<td className="px-4 py-3 text-center text-sm font-medium text-blue-600">{job.products_updated ?? '-'}</td>
|
||||||
|
<td className="px-4 py-3 text-center text-sm font-medium text-green-600">{job.in_stock_count ?? '-'}</td>
|
||||||
|
<td className="px-4 py-3 text-center text-sm font-medium text-red-600">{job.out_of_stock_count ?? '-'}</td>
|
||||||
|
<td className="px-4 py-3 text-sm text-red-600 max-w-xs truncate" title={job.error_message || ''}>
|
||||||
|
{job.error_message || '-'}
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
))}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
) : (
|
||||||
|
<div className="text-center py-12">
|
||||||
|
<Activity className="w-16 h-16 text-gray-300 mx-auto mb-4" />
|
||||||
|
<p className="text-gray-500">No crawl history available</p>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
{/* Products View */}
|
{/* Products View */}
|
||||||
{view === 'products' && (
|
{view === 'products' && (
|
||||||
<>
|
<>
|
||||||
|
|||||||
Reference in New Issue
Block a user