From 04b5c3bd095b41b886fe0c967b39517a3e545cf1 Mon Sep 17 00:00:00 2001 From: Kelly Date: Tue, 2 Dec 2025 13:28:23 -0700 Subject: [PATCH] Add CLAUDE guidelines for consolidated pipeline --- CLAUDE.md | 40 + ...dd_dispensary_id_to_wp_api_permissions.sql | 13 + backend/migrations/031_consolidate_schema.sql | 1 + .../031_product_normalized_fields.sql | 36 + .../032_menu_type_and_local_images.sql | 61 ++ .../033_add_platform_id_to_crawl_status.sql | 63 ++ backend/src/db/migrate.ts | 10 +- backend/src/dutchie-az/services/discovery.ts | 6 +- .../dutchie-az/services/product-crawler.ts | 98 ++- backend/src/dutchie-az/services/scheduler.ts | 2 +- backend/src/index.ts | 2 + backend/src/routes/dashboard.ts | 17 +- backend/src/routes/dispensaries.ts | 132 +++- backend/src/routes/schedule.ts | 403 +++++++++- backend/src/scraper-v2/engine.ts | 46 +- backend/src/scraper-v2/index.ts | 7 + backend/src/scraper-v2/pipelines.ts | 223 +++++- .../src/scrapers/dutchie-graphql-direct.ts | 439 +++++++++++ backend/src/scrapers/dutchie-graphql.ts | 711 ++++++++++++++++++ backend/src/scrapers/templates/dutchie.ts | 11 +- backend/src/scripts/capture-dutchie-schema.ts | 236 ++++++ backend/src/scripts/crawl-all-dutchie.ts | 66 ++ backend/src/scripts/run-dutchie-scrape.ts | 139 ++++ backend/src/scripts/scrape-all-active.ts | 319 ++++++++ backend/src/scripts/test-dutchie-e2e.ts | 156 ++++ backend/src/scripts/test-dutchie-graphql.ts | 233 ++++++ backend/src/scripts/test-status-filter.ts | 106 +++ .../src/services/store-crawl-orchestrator.ts | 78 +- backend/src/utils/image-storage.ts | 322 ++++++++ backend/src/utils/product-normalizer.ts | 206 +++++ frontend/src/lib/api.ts | 65 +- frontend/src/pages/ScraperSchedule.tsx | 407 ++++++++-- 32 files changed, 4485 insertions(+), 169 deletions(-) create mode 100644 CLAUDE.md create mode 100644 backend/migrations/030_add_dispensary_id_to_wp_api_permissions.sql create mode 100644 backend/migrations/031_product_normalized_fields.sql create mode 100644 
backend/migrations/032_menu_type_and_local_images.sql create mode 100644 backend/migrations/033_add_platform_id_to_crawl_status.sql create mode 100644 backend/src/scrapers/dutchie-graphql-direct.ts create mode 100644 backend/src/scrapers/dutchie-graphql.ts create mode 100644 backend/src/scripts/capture-dutchie-schema.ts create mode 100644 backend/src/scripts/crawl-all-dutchie.ts create mode 100644 backend/src/scripts/run-dutchie-scrape.ts create mode 100644 backend/src/scripts/scrape-all-active.ts create mode 100644 backend/src/scripts/test-dutchie-e2e.ts create mode 100644 backend/src/scripts/test-dutchie-graphql.ts create mode 100644 backend/src/scripts/test-status-filter.ts create mode 100644 backend/src/utils/image-storage.ts create mode 100644 backend/src/utils/product-normalizer.ts diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..b8a36307 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,40 @@ +## Claude Guidelines for this Project + +1) **Use the consolidated DB everywhere** + - Preferred env: `CRAWLSY_DATABASE_URL` (fallback `DATABASE_URL`). + - Do NOT create dutchie tables in the legacy DB. Apply migrations 031/032/033 to the consolidated DB and restart. + +2) **Dispensary vs Store** + - Dutchie pipeline uses `dispensaries` (not legacy `stores`). For dutchie crawls, always work with dispensary ID. + - Ignore legacy fields like `dutchie_plus_id` and slug guessing. Use the record’s `menu_url` and `platform_dispensary_id`. + +3) **Menu detection and platform IDs** + - Set `menu_type` from `menu_url` detection; resolve `platform_dispensary_id` for `menu_type='dutchie'`. + - Admin should have “refresh detection” and “resolve ID” actions; schedule/crawl only when `menu_type='dutchie'` AND `platform_dispensary_id` is set. + +4) **Queries and mapping** + - The DB returns snake_case; code expects camelCase. 
Always alias/map: + - `platform_dispensary_id AS "platformDispensaryId"` + - Map via `mapDbRowToDispensary` when loading dispensaries (scheduler, crawler, admin crawl). + - Avoid `SELECT *`; explicitly select and/or map fields. + +5) **Scheduling** + - `/scraper-schedule` should accept filters/search (All vs AZ-only, name). + - “Run Now”/scheduler must skip or warn if `menu_type!='dutchie'` or `platform_dispensary_id` missing. + - Use `dispensary_crawl_status` view; show reason when not crawlable. + +6) **Crawling** + - Trigger dutchie crawls by dispensary ID (e.g., `/api/az/admin/crawl/:id` or `runDispensaryOrchestrator(id)`). + - Update existing products (by stable product ID), append snapshots for history (every 4h cadence), download images locally (`/images/...`), store local URLs. + - Use dutchie GraphQL pipeline only for `menu_type='dutchie'`. + +7) **Frontend** + - Forward-facing URLs: `/api/az`, `/az`, `/az-schedule`; no vendor names. + - `/scraper-schedule`: add filters/search, keep as master view for all schedules; reflect platform ID/menu_type status and controls (resolve ID, run now, enable/disable/delete). + +8) **No slug guessing** + - Do not guess slugs; use the DB record’s `menu_url` and ID. Resolve platform ID from the URL/cName; if set, crawl directly by ID. + +9) **Verify locally before pushing** + - Apply migrations, restart backend, ensure auth (`users` table) exists, run dutchie crawl for a known dispensary (e.g., Deeply Rooted), check `/api/az/dashboard`, `/api/az/stores/:id/products`, `/az`, `/scraper-schedule`. 
+ diff --git a/backend/migrations/030_add_dispensary_id_to_wp_api_permissions.sql b/backend/migrations/030_add_dispensary_id_to_wp_api_permissions.sql new file mode 100644 index 00000000..fa0beeb6 --- /dev/null +++ b/backend/migrations/030_add_dispensary_id_to_wp_api_permissions.sql @@ -0,0 +1,13 @@ +-- Migration: Add dispensary_id to wp_dutchie_api_permissions +-- This allows API tokens to be associated with a specific dispensary + +-- Add dispensary_id column to wp_dutchie_api_permissions +ALTER TABLE wp_dutchie_api_permissions +ADD COLUMN IF NOT EXISTS dispensary_id INTEGER REFERENCES dispensaries(id); + +-- Add index for faster lookups +CREATE INDEX IF NOT EXISTS idx_wp_api_permissions_dispensary_id ON wp_dutchie_api_permissions(dispensary_id); + +-- Add dispensary_name column to return dispensary info without join +ALTER TABLE wp_dutchie_api_permissions +ADD COLUMN IF NOT EXISTS dispensary_name VARCHAR(255); diff --git a/backend/migrations/031_consolidate_schema.sql b/backend/migrations/031_consolidate_schema.sql index 9631742e..78e5bc20 100644 --- a/backend/migrations/031_consolidate_schema.sql +++ b/backend/migrations/031_consolidate_schema.sql @@ -896,6 +896,7 @@ SELECT subcategory, COUNT(*) as product_count, COUNT(DISTINCT dispensary_id) as dispensary_count, + COUNT(DISTINCT brand_name) as brand_count, AVG(thc) as avg_thc, MIN(thc) as min_thc, MAX(thc) as max_thc diff --git a/backend/migrations/031_product_normalized_fields.sql b/backend/migrations/031_product_normalized_fields.sql new file mode 100644 index 00000000..61f01fe0 --- /dev/null +++ b/backend/migrations/031_product_normalized_fields.sql @@ -0,0 +1,36 @@ +-- Migration 031: Add Normalized Fields to Products +-- For improved product matching and deduplication + +-- Add normalized columns to products table +ALTER TABLE products ADD COLUMN IF NOT EXISTS name_normalized VARCHAR(500); +ALTER TABLE products ADD COLUMN IF NOT EXISTS brand_normalized VARCHAR(255); +ALTER TABLE products ADD COLUMN IF NOT 
EXISTS external_id VARCHAR(255); -- Platform-specific ID (Dutchie, Treez, etc) +ALTER TABLE products ADD COLUMN IF NOT EXISTS source_platform VARCHAR(50); -- 'dutchie', 'treez', 'jane', 'wp' + +-- Create indexes for efficient matching +CREATE INDEX IF NOT EXISTS idx_products_external_id ON products(external_id) WHERE external_id IS NOT NULL; +CREATE INDEX IF NOT EXISTS idx_products_name_normalized ON products(store_id, name_normalized); +CREATE INDEX IF NOT EXISTS idx_products_matching ON products(store_id, name_normalized, brand_normalized, category_id); + +-- Backfill normalized names for existing products +UPDATE products SET + name_normalized = LOWER(TRIM(REGEXP_REPLACE(name, '[^a-zA-Z0-9 ]', ' ', 'g'))), + brand_normalized = LOWER(TRIM(COALESCE(brand, ''))), + external_id = COALESCE(external_id, dutchie_product_id), + source_platform = COALESCE(source_platform, 'dutchie') +WHERE name_normalized IS NULL; + +-- Add constraint to prevent true duplicates going forward +-- Note: We use a partial unique index to allow multiple NULLs +CREATE UNIQUE INDEX IF NOT EXISTS idx_products_no_duplicate_external_id + ON products(store_id, external_id) + WHERE external_id IS NOT NULL; + +-- Comments +COMMENT ON COLUMN products.name_normalized IS 'Lowercase, trimmed product name with punctuation removed for matching'; +COMMENT ON COLUMN products.brand_normalized IS 'Lowercase, trimmed brand name for matching'; +COMMENT ON COLUMN products.external_id IS 'Platform-specific product ID (Dutchie ID, Treez SKU, etc)'; +COMMENT ON COLUMN products.source_platform IS 'Source platform: dutchie, treez, jane, wp'; + +-- Grant permissions +GRANT SELECT, INSERT, UPDATE ON products TO scraper; diff --git a/backend/migrations/032_menu_type_and_local_images.sql b/backend/migrations/032_menu_type_and_local_images.sql new file mode 100644 index 00000000..d5e1fb71 --- /dev/null +++ b/backend/migrations/032_menu_type_and_local_images.sql @@ -0,0 +1,61 @@ +-- Migration 032: Add menu_type column and 
local image storage columns +-- Run with: psql "${CRAWLSY_DATABASE_URL:-$DATABASE_URL}" -f migrations/032_menu_type_and_local_images.sql + +-- ============================================ +-- 1. Add menu_type column to dispensaries +-- ============================================ + +-- menu_type: canonical, admin-editable field for menu provider type +-- Separate from menu_provider (auto-detected) to allow manual override +ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS menu_type VARCHAR(50); + +-- Index for filtering by menu_type +CREATE INDEX IF NOT EXISTS idx_dispensaries_menu_type ON dispensaries(menu_type) WHERE menu_type IS NOT NULL; + +-- Backfill menu_type from existing detection data: +-- Priority: menu_provider (if set) > menu_url pattern matching +UPDATE dispensaries +SET menu_type = CASE + -- Use existing menu_provider if set + WHEN menu_provider IS NOT NULL AND menu_provider != '' THEN menu_provider + -- Detect from menu_url patterns + WHEN menu_url ILIKE '%dutchie%' THEN 'dutchie' + WHEN menu_url ILIKE '%treez%' THEN 'treez' + WHEN menu_url ILIKE '%jane%' OR menu_url ILIKE '%iheartjane%' THEN 'jane' + WHEN menu_url ILIKE '%weedmaps%' THEN 'weedmaps' + WHEN menu_url ILIKE '%leafly%' THEN 'leafly' + WHEN menu_url ILIKE '%meadow%' OR menu_url ILIKE '%getmeadow%' THEN 'meadow' + WHEN menu_url ILIKE '%blaze%' THEN 'blaze' + WHEN menu_url ILIKE '%flowhub%' THEN 'flowhub' + WHEN menu_url ILIKE '%dispenseapp%' THEN 'dispense' + WHEN menu_url ILIKE '%cova%' THEN 'cova' + ELSE NULL +END +WHERE menu_type IS NULL; + +-- ============================================ +-- 2. 
Add local image columns to dutchie_products +-- ============================================ + +-- local_image_url: the URL path for serving the downloaded image (e.g., /images/products/123/456.webp) +ALTER TABLE dutchie_products ADD COLUMN IF NOT EXISTS local_image_url TEXT; + +-- local_image_thumb_url: thumbnail version +ALTER TABLE dutchie_products ADD COLUMN IF NOT EXISTS local_image_thumb_url TEXT; + +-- local_image_medium_url: medium version +ALTER TABLE dutchie_products ADD COLUMN IF NOT EXISTS local_image_medium_url TEXT; + +-- original_image_url: preserved third-party URL for fallback/reference +-- (local copies are stored in the local_image_* columns; the image-download step does not overwrite primary_image_url) +ALTER TABLE dutchie_products ADD COLUMN IF NOT EXISTS original_image_url TEXT; + +-- Backfill original_image_url from primary_image_url (preserve third-party URLs) +UPDATE dutchie_products +SET original_image_url = primary_image_url +WHERE original_image_url IS NULL AND primary_image_url IS NOT NULL; + +-- ============================================ +-- Done +-- ============================================ +SELECT 'Migration 032 completed: menu_type column added, local image columns added' as status; diff --git a/backend/migrations/033_add_platform_id_to_crawl_status.sql b/backend/migrations/033_add_platform_id_to_crawl_status.sql new file mode 100644 index 00000000..05f4abc7 --- /dev/null +++ b/backend/migrations/033_add_platform_id_to_crawl_status.sql @@ -0,0 +1,63 @@ +-- Migration 033: Add platform_dispensary_id to dispensary_crawl_status view +-- This exposes platform ID status for scheduling transparency +-- NOTE: as written the view selects cs.interval_minutes only, so dispensary_crawl_schedule must have that column (a cron_expression-only schema will fail) + +-- Recreate the dispensary_crawl_status view with platform_dispensary_id +DROP VIEW IF EXISTS public.dispensary_crawl_status CASCADE; +CREATE OR REPLACE VIEW public.dispensary_crawl_status AS +SELECT + d.id AS dispensary_id, + COALESCE(d.dba_name, d.name) AS dispensary_name, + d.slug AS 
dispensary_slug, + d.city, + d.state, + d.menu_url, + d.menu_type, + d.platform_dispensary_id, + d.scrape_enabled, + d.last_crawl_at, + d.crawl_status, + d.product_crawler_mode, + d.product_provider, + cs.interval_minutes, + cs.is_active, + cs.priority, + cs.last_run_at, + cs.next_run_at, + cs.last_status AS schedule_last_status, + cs.last_error AS schedule_last_error, + cs.consecutive_failures, + j.id AS latest_job_id, + j.status AS latest_job_status, + j.job_type AS latest_job_type, + j.started_at AS latest_job_started, + j.completed_at AS latest_job_completed, + j.products_found AS latest_products_found, + j.products_new AS latest_products_created, + j.products_updated AS latest_products_updated, + j.error_message AS latest_job_error, + -- Computed scheduling eligibility + CASE + WHEN d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL THEN true + ELSE false + END AS can_crawl, + CASE + WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'menu_type not detected' + WHEN d.menu_type != 'dutchie' THEN 'not dutchie platform' + WHEN d.platform_dispensary_id IS NULL THEN 'platform ID not resolved' + WHEN d.scrape_enabled = false THEN 'scraping disabled' + ELSE 'ready' + END AS schedule_status_reason +FROM public.dispensaries d +LEFT JOIN public.dispensary_crawl_schedule cs ON cs.dispensary_id = d.id +LEFT JOIN LATERAL ( + SELECT * + FROM public.dispensary_crawl_jobs dj + WHERE dj.dispensary_id = d.id + ORDER BY dj.created_at DESC + LIMIT 1 +) j ON true +WHERE d.state = 'AZ'; + +-- Done! 
+SELECT 'Migration 033 completed successfully' as status; diff --git a/backend/src/db/migrate.ts b/backend/src/db/migrate.ts index 67b44461..0f8eaf7e 100755 --- a/backend/src/db/migrate.ts +++ b/backend/src/db/migrate.ts @@ -1,7 +1,15 @@ import { Pool } from 'pg'; +// Consolidated DB connection: +// - Prefer CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod) +// - Then DATABASE_URL (default) +const DATABASE_URL = + process.env.CRAWLSY_DATABASE_URL || + process.env.DATABASE_URL || + 'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local'; + const pool = new Pool({ - connectionString: process.env.DATABASE_URL, + connectionString: DATABASE_URL, }); export async function runMigrations() { diff --git a/backend/src/dutchie-az/services/discovery.ts b/backend/src/dutchie-az/services/discovery.ts index e7afa95b..961daf44 100644 --- a/backend/src/dutchie-az/services/discovery.ts +++ b/backend/src/dutchie-az/services/discovery.ts @@ -154,7 +154,7 @@ export async function resolvePlatformDispensaryIds(): Promise<{ resolved: number const { rows: dispensaries } = await query( ` SELECT * FROM dispensaries - WHERE platform = 'dutchie' AND platform_dispensary_id IS NULL + WHERE menu_type = 'dutchie' AND platform_dispensary_id IS NULL ORDER BY id ` ); @@ -199,7 +199,7 @@ export async function resolvePlatformDispensaryIds(): Promise<{ resolved: number */ export async function getAllDispensaries(): Promise { const { rows } = await query( - `SELECT * FROM dispensaries WHERE platform = 'dutchie' ORDER BY name` + `SELECT * FROM dispensaries WHERE menu_type = 'dutchie' ORDER BY name` ); return rows; } @@ -222,7 +222,7 @@ export async function getDispensariesWithPlatformIds(): Promise { const { rows } = await query( ` SELECT * FROM dispensaries - WHERE platform = 'dutchie' AND platform_dispensary_id IS NOT NULL + WHERE menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL ORDER BY name ` ); diff --git a/backend/src/dutchie-az/services/product-crawler.ts 
b/backend/src/dutchie-az/services/product-crawler.ts index d2cca9ef..b7fc58da 100644 --- a/backend/src/dutchie-az/services/product-crawler.ts +++ b/backend/src/dutchie-az/services/product-crawler.ts @@ -19,6 +19,7 @@ import { deriveStockStatus, calculateTotalQuantity, } from '../types'; +import { downloadProductImage, imageExists } from '../../utils/image-storage'; // ============================================================ // NORMALIZATION FUNCTIONS @@ -348,6 +349,55 @@ async function upsertProduct(product: Partial): Promise return result.rows[0].id; } +/** + * Download product image and update local image URLs + * Skips download if local image already exists for this product+URL combo + */ +async function downloadAndUpdateProductImage( + productId: number, + dispensaryId: number, + externalProductId: string, + primaryImageUrl: string | undefined +): Promise<{ downloaded: boolean; error?: string }> { + if (!primaryImageUrl) { + return { downloaded: false, error: 'No image URL' }; + } + + try { + // Check if we already have this image locally + const exists = await imageExists(dispensaryId, externalProductId, primaryImageUrl); + if (exists) { + return { downloaded: false }; + } + + // Download and process the image + const result = await downloadProductImage(primaryImageUrl, dispensaryId, externalProductId); + + if (!result.success || !result.urls) { + return { downloaded: false, error: result.error }; + } + + // Update the product record with local image URLs + await query( + ` + UPDATE dutchie_products + SET + local_image_url = $1, + local_image_thumb_url = $2, + local_image_medium_url = $3, + original_image_url = COALESCE(original_image_url, primary_image_url), + updated_at = NOW() + WHERE id = $4 + `, + [result.urls.full, result.urls.thumb, result.urls.medium, productId] + ); + + return { downloaded: true }; + } catch (error: any) { + return { downloaded: false, error: error.message }; + } +} + /** * Insert a snapshot record */ @@ -536,6 +586,8 @@ export 
interface CrawlResult { modeAProducts?: number; modeBProducts?: number; missingProductsMarked?: number; + imagesDownloaded?: number; + imageErrors?: number; errorMessage?: string; durationMs: number; } @@ -549,10 +601,14 @@ async function processProducts( products: DutchieRawProduct[], dispensary: Dispensary, pricingType: 'rec' | 'med', - crawlMode: CrawlMode -): Promise<{ upserted: number; snapshots: number; productIds: Set }> { + crawlMode: CrawlMode, + options: { downloadImages?: boolean } = {} +): Promise<{ upserted: number; snapshots: number; productIds: Set; imagesDownloaded: number; imageErrors: number }> { + const { downloadImages = true } = options; let upserted = 0; let snapshots = 0; + let imagesDownloaded = 0; + let imageErrors = 0; const productIds = new Set(); for (const raw of products) { @@ -569,6 +625,21 @@ async function processProducts( const productId = await upsertProduct(normalizedProduct); upserted++; + // Download image locally if enabled + if (downloadImages && normalizedProduct.primaryImageUrl) { + const imageResult = await downloadAndUpdateProductImage( + productId, + dispensary.id, + externalId, + normalizedProduct.primaryImageUrl + ); + if (imageResult.downloaded) { + imagesDownloaded++; + } else if (imageResult.error && imageResult.error !== 'No image URL') { + imageErrors++; + } + } + // Create snapshot with crawl mode const snapshot = normalizeSnapshot( raw, @@ -585,7 +656,7 @@ async function processProducts( } } - return { upserted, snapshots, productIds }; + return { upserted, snapshots, productIds, imagesDownloaded, imageErrors }; } /** @@ -598,9 +669,9 @@ async function processProducts( export async function crawlDispensaryProducts( dispensary: Dispensary, pricingType: 'rec' | 'med' = 'rec', - options: { useBothModes?: boolean } = {} + options: { useBothModes?: boolean; downloadImages?: boolean } = {} ): Promise { - const { useBothModes = true } = options; + const { useBothModes = true, downloadImages = true } = options; const 
startTime = Date.now(); if (!dispensary.platformDispensaryId) { @@ -620,6 +691,8 @@ export async function crawlDispensaryProducts( let totalUpserted = 0; let totalSnapshots = 0; + let totalImagesDownloaded = 0; + let totalImageErrors = 0; let modeAProducts = 0; let modeBProducts = 0; let missingMarked = 0; @@ -656,10 +729,13 @@ export async function crawlDispensaryProducts( bothResults.merged.products, dispensary, pricingType, - 'mode_a' // Use mode_a for merged products (convention) + 'mode_a', // Use mode_a for merged products (convention) + { downloadImages } ); totalUpserted = mergedResult.upserted; totalSnapshots = mergedResult.snapshots; + totalImagesDownloaded = mergedResult.imagesDownloaded; + totalImageErrors = mergedResult.imageErrors; } } else { // Single mode crawl (Mode A only) @@ -676,9 +752,11 @@ export async function crawlDispensaryProducts( modeAProductIds.add(p._id); } - const result = await processProducts(products, dispensary, pricingType, crawlMode); + const result = await processProducts(products, dispensary, pricingType, crawlMode, { downloadImages }); totalUpserted = result.upserted; totalSnapshots = result.snapshots; + totalImagesDownloaded = result.imagesDownloaded; + totalImageErrors = result.imageErrors; } // Mark products as missing using UNION of Mode A + Mode B @@ -695,7 +773,7 @@ export async function crawlDispensaryProducts( // Update dispensary stats await updateDispensaryCrawlStats(dispensary.id, totalUpserted); - console.log(`[ProductCrawler] Completed: ${totalUpserted} products, ${totalSnapshots} snapshots, ${missingMarked} marked missing`); + console.log(`[ProductCrawler] Completed: ${totalUpserted} products, ${totalSnapshots} snapshots, ${missingMarked} marked missing, ${totalImagesDownloaded} images downloaded`); return { success: true, @@ -706,6 +784,8 @@ export async function crawlDispensaryProducts( modeAProducts, modeBProducts, missingProductsMarked: missingMarked, + imagesDownloaded: totalImagesDownloaded, + imageErrors: 
totalImageErrors, durationMs: Date.now() - startTime, }; } catch (error: any) { @@ -734,7 +814,7 @@ export async function crawlAllArizonaDispensaries( const { rows: dispensaries } = await query( ` SELECT * FROM dispensaries - WHERE state = 'AZ' AND platform = 'dutchie' AND platform_dispensary_id IS NOT NULL + WHERE state = 'AZ' AND menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL ORDER BY id ` ); diff --git a/backend/src/dutchie-az/services/scheduler.ts b/backend/src/dutchie-az/services/scheduler.ts index 3bb7f8bb..125cf877 100644 --- a/backend/src/dutchie-az/services/scheduler.ts +++ b/backend/src/dutchie-az/services/scheduler.ts @@ -452,7 +452,7 @@ async function executeProductCrawl(config: Record): Promise<{ const { rows: dispensaries } = await query( ` SELECT * FROM dispensaries - WHERE state = 'AZ' AND platform = 'dutchie' AND platform_dispensary_id IS NOT NULL + WHERE state = 'AZ' AND menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL ORDER BY last_crawled_at ASC NULLS FIRST ` ); diff --git a/backend/src/index.ts b/backend/src/index.ts index a872d499..b2c742e4 100755 --- a/backend/src/index.ts +++ b/backend/src/index.ts @@ -3,6 +3,7 @@ import cors from 'cors'; import path from 'path'; import dotenv from 'dotenv'; import { initializeMinio, isMinioEnabled } from './utils/minio'; +import { initializeImageStorage } from './utils/image-storage'; import { logger } from './services/logger'; import { cleanupOrphanedJobs } from './services/proxyTestQueue'; @@ -102,6 +103,7 @@ async function startServer() { logger.info('system', 'Starting server...'); await initializeMinio(); + await initializeImageStorage(); logger.info('system', isMinioEnabled() ? 
'MinIO storage initialized' : 'Local filesystem storage initialized'); // Clean up any orphaned proxy test jobs from previous server runs diff --git a/backend/src/routes/dashboard.ts b/backend/src/routes/dashboard.ts index cb75c52d..ab6eddc5 100755 --- a/backend/src/routes/dashboard.ts +++ b/backend/src/routes/dashboard.ts @@ -1,16 +1,15 @@ import { Router } from 'express'; import { authMiddleware } from '../auth/middleware'; import { pool } from '../db/migrate'; -import { query as azQuery } from '../dutchie-az/db/connection'; // AZ pipeline DB const router = Router(); router.use(authMiddleware); -// Get dashboard stats - now sourced from AZ pipeline (dutchie_az DB) +// Get dashboard stats - consolidated DB (all tables in one DB now) router.get('/stats', async (req, res) => { try { - // Dispensary stats (AZ pipeline) - const dispensariesResult = await azQuery(` + // Dispensary stats + const dispensariesResult = await pool.query(` SELECT COUNT(*) as total, COUNT(*) FILTER (WHERE scrape_enabled = true) as active, @@ -21,8 +20,8 @@ router.get('/stats', async (req, res) => { FROM dispensaries `); - // Product stats from AZ pipeline (dutchie_products) - const productsResult = await azQuery(` + // Product stats from dutchie_products table + const productsResult = await pool.query(` SELECT COUNT(*) as total, COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock, @@ -47,8 +46,8 @@ router.get('/stats', async (req, res) => { WHERE clicked_at >= NOW() - INTERVAL '24 hours' `); - // Recent products added (last 24 hours) from AZ pipeline - const recentProductsResult = await azQuery(` + // Recent products added (last 24 hours) + const recentProductsResult = await pool.query(` SELECT COUNT(*) as new_products_24h FROM dutchie_products WHERE created_at >= NOW() - INTERVAL '24 hours' @@ -77,7 +76,7 @@ router.get('/stats', async (req, res) => { } }); -// Get recent activity - uses AZ data +// Get recent activity - from consolidated DB router.get('/activity', async (req, res) 
=> { try { const { limit = 20 } = req.query; diff --git a/backend/src/routes/dispensaries.ts b/backend/src/routes/dispensaries.ts index 674782a1..1e07b00a 100644 --- a/backend/src/routes/dispensaries.ts +++ b/backend/src/routes/dispensaries.ts @@ -5,10 +5,15 @@ import { pool } from '../db/migrate'; const router = Router(); router.use(authMiddleware); +// Valid menu_type values +const VALID_MENU_TYPES = ['dutchie', 'treez', 'jane', 'weedmaps', 'leafly', 'meadow', 'blaze', 'flowhub', 'dispense', 'cova', 'other', 'unknown']; + // Get all dispensaries router.get('/', async (req, res) => { try { - const result = await pool.query(` + const { menu_type } = req.query; + + let query = ` SELECT id, azdhs_id, @@ -30,14 +35,29 @@ router.get('/', async (req, res) => { latitude, longitude, menu_url, + menu_type, + menu_provider, + menu_provider_confidence, scraper_template, last_menu_scrape, menu_scrape_status, + platform_dispensary_id, created_at, updated_at FROM dispensaries - ORDER BY name - `); + `; + + const params: any[] = []; + + // Filter by menu_type if provided + if (menu_type) { + query += ` WHERE menu_type = $1`; + params.push(menu_type); + } + + query += ` ORDER BY name`; + + const result = await pool.query(query, params); res.json({ dispensaries: result.rows }); } catch (error) { @@ -46,6 +66,22 @@ router.get('/', async (req, res) => { } }); +// Get menu type stats +router.get('/stats/menu-types', async (req, res) => { + try { + const result = await pool.query(` + SELECT menu_type, COUNT(*) as count + FROM dispensaries + GROUP BY menu_type + ORDER BY count DESC + `); + res.json({ menu_types: result.rows, valid_types: VALID_MENU_TYPES }); + } catch (error) { + console.error('Error fetching menu type stats:', error); + res.status(500).json({ error: 'Failed to fetch menu type stats' }); + } +}); + // Get single dispensary by slug router.get('/:slug', async (req, res) => { try { @@ -73,10 +109,14 @@ router.get('/:slug', async (req, res) => { latitude, longitude, 
menu_url, + menu_type, + menu_provider, + menu_provider_confidence, scraper_template, scraper_config, last_menu_scrape, menu_scrape_status, + platform_dispensary_id, created_at, updated_at FROM dispensaries @@ -106,11 +146,19 @@ router.put('/:id', async (req, res) => { google_rating, google_review_count, menu_url, + menu_type, scraper_template, scraper_config, menu_scrape_status } = req.body; + // Validate menu_type if provided + if (menu_type !== undefined && menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) { + return res.status(400).json({ + error: `Invalid menu_type. Must be one of: ${VALID_MENU_TYPES.join(', ')}` + }); + } + const result = await pool.query(` UPDATE dispensaries SET @@ -121,11 +169,12 @@ router.put('/:id', async (req, res) => { google_rating = COALESCE($5, google_rating), google_review_count = COALESCE($6, google_review_count), menu_url = COALESCE($7, menu_url), - scraper_template = COALESCE($8, scraper_template), - scraper_config = COALESCE($9, scraper_config), - menu_scrape_status = COALESCE($10, menu_scrape_status), + menu_type = COALESCE($8, menu_type), + scraper_template = COALESCE($9, scraper_template), + scraper_config = COALESCE($10, scraper_config), + menu_scrape_status = COALESCE($11, menu_scrape_status), updated_at = CURRENT_TIMESTAMP - WHERE id = $11 + WHERE id = $12 RETURNING * `, [ dba_name, @@ -135,6 +184,7 @@ router.put('/:id', async (req, res) => { google_rating, google_review_count, menu_url, + menu_type, scraper_template, scraper_config, menu_scrape_status, @@ -384,4 +434,72 @@ router.post('/:slug/scrape', async (req, res) => { } }); +// Update menu_type for a dispensary (dedicated endpoint) +router.patch('/:id/menu-type', async (req, res) => { + try { + const { id } = req.params; + const { menu_type } = req.body; + + // Validate menu_type + if (menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) { + return res.status(400).json({ + error: `Invalid menu_type. 
Must be one of: ${VALID_MENU_TYPES.join(', ')} (or null to clear)` + }); + } + + const result = await pool.query(` + UPDATE dispensaries + SET menu_type = $1, updated_at = CURRENT_TIMESTAMP + WHERE id = $2 + RETURNING id, name, slug, menu_type, menu_provider, menu_url + `, [menu_type || null, id]); + + if (result.rows.length === 0) { + return res.status(404).json({ error: 'Dispensary not found' }); + } + + res.json({ + success: true, + dispensary: result.rows[0] + }); + } catch (error) { + console.error('Error updating menu_type:', error); + res.status(500).json({ error: 'Failed to update menu_type' }); + } +}); + +// Bulk update menu_type for multiple dispensaries +router.post('/bulk/menu-type', async (req, res) => { + try { + const { dispensary_ids, menu_type } = req.body; + + if (!Array.isArray(dispensary_ids) || dispensary_ids.length === 0) { + return res.status(400).json({ error: 'dispensary_ids must be a non-empty array' }); + } + + // Validate menu_type + if (menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) { + return res.status(400).json({ + error: `Invalid menu_type. 
Must be one of: ${VALID_MENU_TYPES.join(', ')} (or null to clear)` + }); + } + + const result = await pool.query(` + UPDATE dispensaries + SET menu_type = $1, updated_at = CURRENT_TIMESTAMP + WHERE id = ANY($2::int[]) + RETURNING id, name, slug, menu_type + `, [menu_type || null, dispensary_ids]); + + res.json({ + success: true, + updated_count: result.rowCount, + dispensaries: result.rows + }); + } catch (error) { + console.error('Error bulk updating menu_type:', error); + res.status(500).json({ error: 'Failed to bulk update menu_type' }); + } +}); + export default router; diff --git a/backend/src/routes/schedule.ts b/backend/src/routes/schedule.ts index 44ef84b3..ec8f56ee 100644 --- a/backend/src/routes/schedule.ts +++ b/backend/src/routes/schedule.ts @@ -27,6 +27,7 @@ import { ensureAllDispensariesHaveSchedules, } from '../services/dispensary-orchestrator'; import { pool } from '../db/migrate'; +import { resolveDispensaryId } from '../dutchie-az/services/graphql-client'; const router = Router(); router.use(authMiddleware); @@ -354,14 +355,91 @@ router.get('/due', async (req: Request, res: Response) => { /** * GET /api/schedule/dispensaries - * Get all dispensary schedule statuses (uses the view) + * Get all dispensary schedule statuses with optional filters + * Query params: + * - state: filter by state (e.g., 'AZ') + * - search: search by name or slug */ router.get('/dispensaries', async (req: Request, res: Response) => { try { - const result = await pool.query(` - SELECT * FROM dispensary_crawl_status - ORDER BY priority DESC, dispensary_name - `); + const { state, search } = req.query; + + // Build dynamic query with optional filters + const conditions: string[] = []; + const params: any[] = []; + let paramIndex = 1; + + if (state) { + conditions.push(`d.state = $${paramIndex}`); + params.push(state); + paramIndex++; + } + + if (search) { + conditions.push(`(d.name ILIKE $${paramIndex} OR d.slug ILIKE $${paramIndex} OR d.dba_name ILIKE $${paramIndex})`); + 
params.push(`%${search}%`); + paramIndex++; + } + + const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : ''; + + const query = ` + SELECT + d.id AS dispensary_id, + COALESCE(d.dba_name, d.name) AS dispensary_name, + d.slug AS dispensary_slug, + d.city, + d.state, + d.menu_url, + d.menu_type, + d.platform_dispensary_id, + d.scrape_enabled, + d.last_crawl_at, + d.crawl_status, + d.product_crawler_mode, + d.product_provider, + cs.interval_minutes, + cs.is_active, + cs.priority, + cs.last_run_at, + cs.next_run_at, + cs.last_status AS schedule_last_status, + cs.last_error AS schedule_last_error, + cs.consecutive_failures, + j.id AS latest_job_id, + j.status AS latest_job_status, + j.job_type AS latest_job_type, + j.started_at AS latest_job_started, + j.completed_at AS latest_job_completed, + j.products_found AS latest_products_found, + j.products_new AS latest_products_created, + j.products_updated AS latest_products_updated, + j.error_message AS latest_job_error, + CASE + WHEN d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL THEN true + ELSE false + END AS can_crawl, + CASE + WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'menu_type not detected' + WHEN d.menu_type != 'dutchie' THEN 'not dutchie platform' + WHEN d.platform_dispensary_id IS NULL THEN 'platform ID not resolved' + WHEN d.scrape_enabled = false THEN 'scraping disabled' + ELSE 'ready' + END AS schedule_status_reason + FROM public.dispensaries d + LEFT JOIN public.dispensary_crawl_schedule cs ON cs.dispensary_id = d.id + LEFT JOIN LATERAL ( + SELECT * + FROM public.dispensary_crawl_jobs dj + WHERE dj.dispensary_id = d.id + ORDER BY dj.created_at DESC + LIMIT 1 + ) j ON true + ${whereClause} + ORDER BY cs.priority DESC NULLS LAST, COALESCE(d.dba_name, d.name) + `; + + const result = await pool.query(query, params); res.json({ dispensaries: result.rows }); } catch (error: any) { console.error('Error fetching dispensary schedules:', error); @@ -589,4 
+667,319 @@ router.post('/dispensaries/bootstrap', requireRole('superadmin', 'admin'), async } }); +// ============================================ +// Platform ID & Menu Type Detection Endpoints +// ============================================ + +/** + * POST /api/schedule/dispensaries/:id/resolve-platform-id + * Resolve the Dutchie platform_dispensary_id from menu_url slug + */ +router.post('/dispensaries/:id/resolve-platform-id', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => { + try { + const dispensaryId = parseInt(req.params.id); + if (isNaN(dispensaryId)) { + return res.status(400).json({ error: 'Invalid dispensary ID' }); + } + + // Get dispensary info + const dispensaryResult = await pool.query(` + SELECT id, name, slug, menu_url, menu_type, platform_dispensary_id + FROM dispensaries WHERE id = $1 + `, [dispensaryId]); + + if (dispensaryResult.rows.length === 0) { + return res.status(404).json({ error: 'Dispensary not found' }); + } + + const dispensary = dispensaryResult.rows[0]; + + // Check if already resolved + if (dispensary.platform_dispensary_id) { + return res.json({ + success: true, + message: 'Platform ID already resolved', + platform_dispensary_id: dispensary.platform_dispensary_id, + already_resolved: true + }); + } + + // Extract slug from menu_url for Dutchie URLs + let slugToResolve = dispensary.slug; + if (dispensary.menu_url) { + // Match embedded-menu or dispensary URLs + const match = dispensary.menu_url.match(/(?:embedded-menu|dispensar(?:y|ies))\/([^\/\?#]+)/i); + if (match) { + slugToResolve = match[1]; + } + } + + if (!slugToResolve) { + return res.status(400).json({ + error: 'No slug available to resolve platform ID', + menu_url: dispensary.menu_url + }); + } + + console.log(`[Schedule] Resolving platform ID for ${dispensary.name} using slug: ${slugToResolve}`); + + // Resolve platform ID using GraphQL client + const platformId = await resolveDispensaryId(slugToResolve); + + if (!platformId) { + return 
res.status(404).json({ + error: 'Could not resolve platform ID', + slug_tried: slugToResolve, + message: 'The dispensary might not be on Dutchie or the slug is incorrect' + }); + } + + // Update the dispensary with resolved platform ID + await pool.query(` + UPDATE dispensaries + SET platform_dispensary_id = $1, + menu_type = COALESCE(menu_type, 'dutchie'), + updated_at = NOW() + WHERE id = $2 + `, [platformId, dispensaryId]); + + res.json({ + success: true, + platform_dispensary_id: platformId, + slug_resolved: slugToResolve, + message: `Platform ID resolved: ${platformId}` + }); + } catch (error: any) { + console.error('Error resolving platform ID:', error); + res.status(500).json({ error: 'Failed to resolve platform ID', details: error.message }); + } +}); + +/** + * POST /api/schedule/dispensaries/:id/detect-menu-type + * Detect menu type from menu_url + */ +router.post('/dispensaries/:id/detect-menu-type', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => { + try { + const dispensaryId = parseInt(req.params.id); + if (isNaN(dispensaryId)) { + return res.status(400).json({ error: 'Invalid dispensary ID' }); + } + + // Get dispensary info + const dispensaryResult = await pool.query(` + SELECT id, name, menu_url, website FROM dispensaries WHERE id = $1 + `, [dispensaryId]); + + if (dispensaryResult.rows.length === 0) { + return res.status(404).json({ error: 'Dispensary not found' }); + } + + const dispensary = dispensaryResult.rows[0]; + const urlToCheck = dispensary.menu_url || dispensary.website; + + if (!urlToCheck) { + return res.status(400).json({ error: 'No menu_url or website to detect from' }); + } + + // Detect menu type from URL patterns + let detectedType: string = 'unknown'; + + if (urlToCheck.includes('dutchie.com') || urlToCheck.includes('embedded-menu')) { + detectedType = 'dutchie'; + } else if (urlToCheck.includes('iheartjane.com') || urlToCheck.includes('jane.co')) { + detectedType = 'jane'; + } else if 
(urlToCheck.includes('weedmaps.com')) { + detectedType = 'weedmaps'; + } else if (urlToCheck.includes('leafly.com')) { + detectedType = 'leafly'; + } else if (urlToCheck.includes('treez.io') || urlToCheck.includes('treez.co')) { + detectedType = 'treez'; + } else if (urlToCheck.includes('meadow.com')) { + detectedType = 'meadow'; + } else if (urlToCheck.includes('blaze.me') || urlToCheck.includes('blazepay')) { + detectedType = 'blaze'; + } else if (urlToCheck.includes('flowhub.com')) { + detectedType = 'flowhub'; + } else if (urlToCheck.includes('dispense.app')) { + detectedType = 'dispense'; + } else if (urlToCheck.includes('covasoft.com')) { + detectedType = 'cova'; + } + + // Update menu_type + await pool.query(` + UPDATE dispensaries + SET menu_type = $1, updated_at = NOW() + WHERE id = $2 + `, [detectedType, dispensaryId]); + + res.json({ + success: true, + menu_type: detectedType, + url_checked: urlToCheck, + message: `Menu type detected: ${detectedType}` + }); + } catch (error: any) { + console.error('Error detecting menu type:', error); + res.status(500).json({ error: 'Failed to detect menu type' }); + } +}); + +/** + * POST /api/schedule/dispensaries/:id/refresh-detection + * Combined: detect menu_type AND resolve platform_dispensary_id if dutchie + */ +router.post('/dispensaries/:id/refresh-detection', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => { + try { + const dispensaryId = parseInt(req.params.id); + if (isNaN(dispensaryId)) { + return res.status(400).json({ error: 'Invalid dispensary ID' }); + } + + // Get dispensary info + const dispensaryResult = await pool.query(` + SELECT id, name, slug, menu_url, website FROM dispensaries WHERE id = $1 + `, [dispensaryId]); + + if (dispensaryResult.rows.length === 0) { + return res.status(404).json({ error: 'Dispensary not found' }); + } + + const dispensary = dispensaryResult.rows[0]; + const urlToCheck = dispensary.menu_url || dispensary.website; + + if (!urlToCheck) { + return 
res.status(400).json({ error: 'No menu_url or website to detect from' }); + } + + // Detect menu type from URL patterns + let detectedType: string = 'unknown'; + + if (urlToCheck.includes('dutchie.com') || urlToCheck.includes('embedded-menu')) { + detectedType = 'dutchie'; + } else if (urlToCheck.includes('iheartjane.com') || urlToCheck.includes('jane.co')) { + detectedType = 'jane'; + } else if (urlToCheck.includes('weedmaps.com')) { + detectedType = 'weedmaps'; + } else if (urlToCheck.includes('leafly.com')) { + detectedType = 'leafly'; + } else if (urlToCheck.includes('treez.io') || urlToCheck.includes('treez.co')) { + detectedType = 'treez'; + } else if (urlToCheck.includes('meadow.com')) { + detectedType = 'meadow'; + } else if (urlToCheck.includes('blaze.me') || urlToCheck.includes('blazepay')) { + detectedType = 'blaze'; + } else if (urlToCheck.includes('flowhub.com')) { + detectedType = 'flowhub'; + } else if (urlToCheck.includes('dispense.app')) { + detectedType = 'dispense'; + } else if (urlToCheck.includes('covasoft.com')) { + detectedType = 'cova'; + } + + // Update menu_type first + await pool.query(` + UPDATE dispensaries SET menu_type = $1, updated_at = NOW() WHERE id = $2 + `, [detectedType, dispensaryId]); + + let platformId: string | null = null; + + // If dutchie, also try to resolve platform ID + if (detectedType === 'dutchie') { + let slugToResolve = dispensary.slug; + const match = urlToCheck.match(/(?:embedded-menu|dispensar(?:y|ies))\/([^\/\?#]+)/i); + if (match) { + slugToResolve = match[1]; + } + + if (slugToResolve) { + try { + console.log(`[Schedule] Resolving platform ID for ${dispensary.name} using slug: ${slugToResolve}`); + platformId = await resolveDispensaryId(slugToResolve); + + if (platformId) { + await pool.query(` + UPDATE dispensaries SET platform_dispensary_id = $1, updated_at = NOW() WHERE id = $2 + `, [platformId, dispensaryId]); + } + } catch (err: any) { + console.warn(`[Schedule] Failed to resolve platform ID: 
${err.message}`); + } + } + } + + res.json({ + success: true, + menu_type: detectedType, + platform_dispensary_id: platformId, + url_checked: urlToCheck, + can_crawl: detectedType === 'dutchie' && !!platformId + }); + } catch (error: any) { + console.error('Error refreshing detection:', error); + res.status(500).json({ error: 'Failed to refresh detection' }); + } +}); + +/** + * PUT /api/schedule/dispensaries/:id/toggle-active + * Enable or disable schedule for a dispensary + */ +router.put('/dispensaries/:id/toggle-active', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => { + try { + const dispensaryId = parseInt(req.params.id); + if (isNaN(dispensaryId)) { + return res.status(400).json({ error: 'Invalid dispensary ID' }); + } + + const { is_active } = req.body; + + // Upsert schedule with new is_active value + const result = await pool.query(` + INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority) + VALUES ($1, $2, 240, 0) + ON CONFLICT (dispensary_id) DO UPDATE SET + is_active = $2, + updated_at = NOW() + RETURNING * + `, [dispensaryId, is_active]); + + res.json({ + success: true, + schedule: result.rows[0], + message: is_active ? 
'Schedule enabled' : 'Schedule disabled' + }); + } catch (error: any) { + console.error('Error toggling schedule active status:', error); + res.status(500).json({ error: 'Failed to toggle schedule' }); + } +}); + +/** + * DELETE /api/schedule/dispensaries/:id/schedule + * Delete schedule for a dispensary + */ +router.delete('/dispensaries/:id/schedule', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => { + try { + const dispensaryId = parseInt(req.params.id); + if (isNaN(dispensaryId)) { + return res.status(400).json({ error: 'Invalid dispensary ID' }); + } + + const result = await pool.query(` + DELETE FROM dispensary_crawl_schedule WHERE dispensary_id = $1 RETURNING id + `, [dispensaryId]); + + res.json({ + success: true, + deleted: result.rowCount > 0, + message: result.rowCount > 0 ? 'Schedule deleted' : 'No schedule to delete' + }); + } catch (error: any) { + console.error('Error deleting schedule:', error); + res.status(500).json({ error: 'Failed to delete schedule' }); + } +}); + export default router; diff --git a/backend/src/scraper-v2/engine.ts b/backend/src/scraper-v2/engine.ts index 2c61ebfd..cda2f375 100644 --- a/backend/src/scraper-v2/engine.ts +++ b/backend/src/scraper-v2/engine.ts @@ -425,7 +425,29 @@ export class DutchieSpider { href = window.location.origin + href; } - items.push({ name, price, originalPrice, href }); + // Extract image URL from product card + let imageUrl = null; + const imgSelectors = [ + 'img[src*="images.dutchie.com"]', + 'img[src*="dutchie"]', + 'img[data-testid*="product"]', + 'img[class*="product"]', + 'img[class*="Product"]', + 'picture img', + 'img' + ]; + for (const sel of imgSelectors) { + const img = card.querySelector(sel); + if (img) { + const src = img.getAttribute('src') || img.getAttribute('data-src') || ''; + if (src && (src.includes('dutchie.com') || src.includes('images.'))) { + imageUrl = src; + break; + } + } + } + + items.push({ name, price, originalPrice, href, imageUrl }); } catch 
(err) { console.error('Error parsing product card:', err); @@ -447,6 +469,7 @@ export class DutchieSpider { productName: card.name, productPrice: card.price, productOriginalPrice: card.originalPrice, + productImageUrl: card.imageUrl, // Pass image from category page requiresBrowser: true }, callback: this.parseProductPage.bind(this) @@ -472,21 +495,27 @@ export class DutchieSpider { // @ts-ignore - runs in browser context const allText = document.body.textContent || ''; - // Extract image + // Extract image - expanded selectors for better coverage let fullSizeImage = null; const mainImageSelectors = [ + 'img[src*="images.dutchie.com"]', + 'img[src*="dutchie"]', 'img[class*="ProductImage"]', 'img[class*="product-image"]', + 'img[class*="Product"]', '[class*="ImageGallery"] img', - 'main img', - 'img[src*="images.dutchie.com"]' + '[data-testid*="product"] img', + '[data-testid*="image"] img', + 'picture img', + 'main img' ]; for (const sel of mainImageSelectors) { // @ts-ignore - runs in browser context const img = document.querySelector(sel) as any; - if (img?.src && img.src.includes('dutchie.com')) { - fullSizeImage = img.src; + const src = img?.src || img?.getAttribute('data-src') || ''; + if (src && (src.includes('dutchie.com') || src.includes('images.'))) { + fullSizeImage = src; break; } } @@ -593,6 +622,9 @@ export class DutchieSpider { }); // Create product item + // Use image from product page, fallback to category page image + const imageUrl = details.fullSizeImage || response.request.metadata.productImageUrl || undefined; + const product: Product & { storeId: number; categoryId: number } = { dutchieProductId: `${response.request.metadata.storeSlug}-${response.request.metadata.categorySlug}-${Date.now()}-${Math.random()}`, name: productName || 'Unknown Product', @@ -603,7 +635,7 @@ export class DutchieSpider { cbdPercentage: details.cbd || undefined, strainType: details.strainType || undefined, brand: details.brand || undefined, - imageUrl: 
details.fullSizeImage || undefined, + imageUrl: imageUrl, dutchieUrl: response.url, metadata: { terpenes: details.terpenes, diff --git a/backend/src/scraper-v2/index.ts b/backend/src/scraper-v2/index.ts index c994750e..4053f9db 100644 --- a/backend/src/scraper-v2/index.ts +++ b/backend/src/scraper-v2/index.ts @@ -1,6 +1,13 @@ /** * Scraper V2 - Scrapy-inspired web scraping framework * + * IMPORTANT: For Dutchie stores, DO NOT USE scrapeStore() from this module. + * Dutchie crawling must go through the dutchie-az GraphQL pipeline: + * src/dutchie-az/services/product-crawler.ts + * + * This scraper-v2 module uses DOM-based extraction which is unreliable + * for Dutchie. The new dutchie-az pipeline uses GraphQL directly. + * * Architecture: * - Engine: Main orchestrator * - Scheduler: Priority queue with deduplication diff --git a/backend/src/scraper-v2/pipelines.ts b/backend/src/scraper-v2/pipelines.ts index 5f4ed3b4..37f1163a 100644 --- a/backend/src/scraper-v2/pipelines.ts +++ b/backend/src/scraper-v2/pipelines.ts @@ -2,6 +2,7 @@ import { ItemPipeline, Product } from './types'; import { logger } from '../services/logger'; import { pool } from '../db/migrate'; import { uploadImageFromUrl } from '../utils/minio'; +import { normalizeProductName, normalizeBrandName } from '../utils/product-normalizer'; /** * Validation Pipeline - ensures data quality @@ -166,12 +167,25 @@ function generateSlug(name: string): string { } /** - * Database Pipeline - saves items to database + * Database Pipeline - saves items to database with improved matching + * + * MATCHING PRIORITY: + * 1. external_id (dutchie_product_id) - exact match + * 2. normalized name + brand + category - strong match + * 3. normalized name + category - weak match (same product, different/missing brand) + * + * ALWAYS creates a snapshot after upsert for historical tracking. 
*/ export class DatabasePipeline implements ItemPipeline { name = 'DatabasePipeline'; priority = 10; // Low priority - runs last + private crawlId: string | null = null; + + setCrawlId(id: string): void { + this.crawlId = id; + } + async process(item: Product, spider: string): Promise { const client = await pool.connect(); @@ -180,78 +194,155 @@ export class DatabasePipeline implements ItemPipeline { const storeId = (item as any).storeId; const categoryId = (item as any).categoryId; const dispensaryId = (item as any).dispensaryId; + const categoryName = (item as any).categoryName; - // Generate slug from name + // Generate normalized values for matching + const nameNormalized = normalizeProductName(item.name); + const brandNormalized = normalizeBrandName(item.brand); const slug = generateSlug(item.name); + const externalId = item.dutchieProductId || null; if (!storeId || !categoryId) { logger.error('pipeline', `Missing storeId or categoryId for ${item.name}`); return null; } - // Check if product exists - const existingResult = await client.query(` - SELECT id, image_url, local_image_path - FROM products - WHERE store_id = $1 AND name = $2 AND category_id = $3 - `, [storeId, item.name, categoryId]); + let productId: number | null = null; + let localImagePath: string | null = null; + let isNewProduct = false; - let localImagePath = null; - let productId: number; + // STEP 1: Try to match by external_id (most reliable) + if (externalId) { + const extMatch = await client.query(` + SELECT id, image_url, local_image_path + FROM products + WHERE store_id = $1 AND (external_id = $2 OR dutchie_product_id = $2) + `, [storeId, externalId]); - if (existingResult.rows.length > 0) { + if (extMatch.rows.length > 0) { + productId = extMatch.rows[0].id; + localImagePath = extMatch.rows[0].local_image_path; + logger.debug('pipeline', `Matched by external_id: ${item.name}`); + } + } + + // STEP 2: Try to match by normalized name + brand + category + if (!productId) { + const 
normMatch = await client.query(` + SELECT id, image_url, local_image_path + FROM products + WHERE store_id = $1 + AND name_normalized = $2 + AND brand_normalized = $3 + AND category_id = $4 + `, [storeId, nameNormalized, brandNormalized, categoryId]); + + if (normMatch.rows.length > 0) { + productId = normMatch.rows[0].id; + localImagePath = normMatch.rows[0].local_image_path; + logger.debug('pipeline', `Matched by normalized name+brand+category: ${item.name}`); + } + } + + // STEP 3: Fallback to normalized name + category only (weaker match) + if (!productId) { + const weakMatch = await client.query(` + SELECT id, image_url, local_image_path + FROM products + WHERE store_id = $1 + AND name_normalized = $2 + AND category_id = $3 + LIMIT 1 + `, [storeId, nameNormalized, categoryId]); + + if (weakMatch.rows.length === 1) { + productId = weakMatch.rows[0].id; + localImagePath = weakMatch.rows[0].local_image_path; + logger.debug('pipeline', `Matched by normalized name+category: ${item.name}`); + } + } + + // STEP 4: Final fallback - exact name match (legacy compatibility) + if (!productId) { + const exactMatch = await client.query(` + SELECT id, image_url, local_image_path + FROM products + WHERE store_id = $1 AND name = $2 AND category_id = $3 + `, [storeId, item.name, categoryId]); + + if (exactMatch.rows.length > 0) { + productId = exactMatch.rows[0].id; + localImagePath = exactMatch.rows[0].local_image_path; + logger.debug('pipeline', `Matched by exact name: ${item.name}`); + } + } + + // UPDATE or INSERT + if (productId) { // Update existing product - productId = existingResult.rows[0].id; - localImagePath = existingResult.rows[0].local_image_path; - await client.query(` UPDATE products SET name = $1, description = $2, price = $3, strain_type = $4, thc_percentage = $5, cbd_percentage = $6, - brand = $7, weight = $8, image_url = $9, dutchie_url = $10, + brand = $7, weight = $8, image_url = COALESCE($9, image_url), dutchie_url = $10, in_stock = true, metadata = $11, 
last_seen_at = CURRENT_TIMESTAMP, - updated_at = CURRENT_TIMESTAMP, dispensary_id = $13, slug = COALESCE(slug, $14) + updated_at = CURRENT_TIMESTAMP, dispensary_id = $13, slug = COALESCE(slug, $14), + name_normalized = $15, brand_normalized = $16, + external_id = COALESCE(external_id, $17), source_platform = COALESCE(source_platform, 'dutchie') WHERE id = $12 `, [ item.name, item.description, item.price, item.strainType, item.thcPercentage, item.cbdPercentage, item.brand, item.weight, item.imageUrl, item.dutchieUrl, - JSON.stringify(item.metadata || {}), productId, dispensaryId, slug + JSON.stringify(item.metadata || {}), productId, dispensaryId, slug, + nameNormalized, brandNormalized, externalId ]); logger.debug('pipeline', `Updated product: ${item.name}`); } else { // Insert new product + isNewProduct = true; const insertResult = await client.query(` INSERT INTO products ( - store_id, category_id, dispensary_id, dutchie_product_id, slug, name, description, + store_id, category_id, dispensary_id, dutchie_product_id, external_id, + slug, name, name_normalized, description, price, strain_type, thc_percentage, cbd_percentage, - brand, weight, image_url, dutchie_url, in_stock, metadata - ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16) + brand, brand_normalized, weight, image_url, dutchie_url, in_stock, metadata, + source_platform + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, true, $19, 'dutchie') RETURNING id `, [ - storeId, categoryId, dispensaryId, item.dutchieProductId, slug, item.name, item.description, + storeId, categoryId, dispensaryId, externalId, externalId, + slug, item.name, nameNormalized, item.description, item.price, item.strainType, item.thcPercentage, item.cbdPercentage, - item.brand, item.weight, item.imageUrl, item.dutchieUrl, + item.brand, brandNormalized, item.weight, item.imageUrl, item.dutchieUrl, JSON.stringify(item.metadata || {}) ]); productId = 
insertResult.rows[0].id; - logger.debug('pipeline', `Inserted new product: ${item.name}`); + logger.debug('pipeline', `Inserted NEW product: ${item.name}`); } - // Download image if needed - if (item.imageUrl && !localImagePath) { + // ALWAYS create a snapshot for historical tracking + await this.createSnapshot(client, { + productId: productId!, + dispensaryId, + externalId, + slug, + item, + categoryName + }); + + // Download image if needed (only for new products or missing local image) + if (item.imageUrl && !localImagePath && productId) { try { - // Get store slug for organized image storage const storeResult = await client.query( 'SELECT slug FROM stores WHERE id = $1', [storeId] ); const storeSlug = storeResult.rows[0]?.slug || undefined; - const imageSizes = await uploadImageFromUrl(item.imageUrl, productId, storeSlug); - // Use thumbnail path for local_image_path + const imageSizes = await uploadImageFromUrl(item.imageUrl, productId!, storeSlug); localImagePath = imageSizes.thumbnail; await client.query(` UPDATE products SET local_image_path = $1 WHERE id = $2 @@ -262,6 +353,10 @@ export class DatabasePipeline implements ItemPipeline { } } + // Attach metadata for stats tracking + (item as any).isNewProduct = isNewProduct; + (item as any).productId = productId; + return item; } catch (error) { @@ -271,6 +366,78 @@ export class DatabasePipeline implements ItemPipeline { client.release(); } } + + /** + * Create a snapshot record for historical tracking + */ + private async createSnapshot( + client: any, + params: { + productId: number; + dispensaryId: number | null; + externalId: string | null; + slug: string; + item: Product; + categoryName?: string; + } + ): Promise { + try { + // Only create snapshots if the table exists (graceful degradation) + const tableExists = await client.query(` + SELECT EXISTS ( + SELECT FROM information_schema.tables + WHERE table_name = 'product_snapshots' + ) + `); + + if (!tableExists.rows[0].exists) { + return; // Snapshot 
table not yet created + } + + const crawlId = this.crawlId || crypto.randomUUID(); + const { productId, dispensaryId, externalId, slug, item, categoryName } = params; + + await client.query(` + INSERT INTO product_snapshots ( + crawl_id, dispensary_id, external_product_id, product_slug, + name, brand, category, price, original_price, sale_price, + discount_type, discount_value, availability_status, stock_quantity, + thc_percentage, cbd_percentage, strain_type, weight, variant, + description, image_url, effects, terpenes, captured_at + ) VALUES ( + $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, NOW() + ) + `, [ + crawlId, + dispensaryId, + externalId, + slug, + item.name, + item.brand || null, + categoryName || null, + item.price || null, + item.originalPrice || null, + item.metadata?.salePrice || null, + item.metadata?.discountType || null, + item.metadata?.discountValue || null, + 'in_stock', // availability_status - if we scraped it, it's in stock + item.metadata?.stockQuantity || null, + item.thcPercentage || null, + item.cbdPercentage || null, + item.strainType || null, + item.weight || null, + item.metadata?.variant || null, + item.description || null, + item.imageUrl || null, + item.metadata?.effects || null, + item.metadata?.terpenes || null + ]); + + } catch (error) { + // Don't fail the whole pipeline if snapshot creation fails + logger.warn('pipeline', `Failed to create snapshot for ${params.item.name}: ${error}`); + } + } } /** diff --git a/backend/src/scrapers/dutchie-graphql-direct.ts b/backend/src/scrapers/dutchie-graphql-direct.ts new file mode 100644 index 00000000..4494f90b --- /dev/null +++ b/backend/src/scrapers/dutchie-graphql-direct.ts @@ -0,0 +1,439 @@ +// ============================================================================ +// DEPRECATED: This scraper writes to the LEGACY products table. +// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline. 
+// +// New pipeline location: src/dutchie-az/services/product-crawler.ts +// - Uses fetch-based GraphQL (no Puppeteer needed) +// - Writes to isolated dutchie_az_* tables with snapshot model +// - Tracks stockStatus, isPresentInFeed, missing_from_feed +// ============================================================================ + +/** + * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead. + * This scraper writes to the legacy products table, not the new dutchie_az tables. + * + * Makes direct GraphQL requests from within the browser context to: + * 1. Bypass Cloudflare (using browser session) + * 2. Fetch ALL products including out-of-stock (Status: null) + * 3. Paginate through complete menu + */ + +import puppeteer from 'puppeteer-extra'; +import type { Browser, Page } from 'puppeteer'; +import StealthPlugin from 'puppeteer-extra-plugin-stealth'; +import { Pool } from 'pg'; +import { DutchieProduct, NormalizedProduct, normalizeDutchieProduct } from './dutchie-graphql'; + +puppeteer.use(StealthPlugin()); + +// GraphQL persisted query hashes +const GRAPHQL_HASHES = { + FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0', + GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b', +}; + +interface FetchResult { + products: DutchieProduct[]; + dispensaryId: string; + totalProducts: number; + activeCount: number; + inactiveCount: number; +} + +/** + * Fetch all products via in-page GraphQL requests + * This includes both in-stock and out-of-stock items + */ +export async function fetchAllDutchieProducts( + menuUrl: string, + options: { + headless?: boolean | 'new'; + timeout?: number; + perPage?: number; + includeOutOfStock?: boolean; + } = {} +): Promise { + const { + headless = 'new', + timeout = 90000, + perPage = 100, + includeOutOfStock = true, + } = options; + + let browser: Browser | undefined; + + try { + browser = await puppeteer.launch({ + headless, + 
args: [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + '--disable-blink-features=AutomationControlled', + ], + }); + + const page = await browser.newPage(); + + // Stealth configuration + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' + ); + await page.setViewport({ width: 1920, height: 1080 }); + await page.evaluateOnNewDocument(() => { + Object.defineProperty(navigator, 'webdriver', { get: () => false }); + (window as any).chrome = { runtime: {} }; + }); + + // Navigate to menu page to establish session + console.log('[DutchieGraphQL] Loading menu page to establish session...'); + await page.goto(menuUrl, { + waitUntil: 'networkidle2', + timeout, + }); + + // Get dispensary ID from page + const dispensaryId = await page.evaluate(() => { + const env = (window as any).reactEnv; + return env?.dispensaryId || env?.retailerId || ''; + }); + + if (!dispensaryId) { + throw new Error('Could not determine dispensaryId from page'); + } + + console.log(`[DutchieGraphQL] Dispensary ID: ${dispensaryId}`); + + // Fetch all products via in-page GraphQL requests + const allProducts: DutchieProduct[] = []; + let page_num = 0; + let hasMore = true; + + while (hasMore) { + console.log(`[DutchieGraphQL] Fetching page ${page_num} (perPage=${perPage})...`); + + const result = await page.evaluate( + async (dispensaryId: string, page_num: number, perPage: number, includeOutOfStock: boolean, hash: string) => { + const variables = { + includeEnterpriseSpecials: false, + productsFilter: { + dispensaryId, + pricingType: 'rec', + Status: includeOutOfStock ? 
null : 'Active', // null = include out-of-stock + types: [], + useCache: false, // Don't cache to get fresh data + isDefaultSort: true, + sortBy: 'popularSortIdx', + sortDirection: 1, + bypassOnlineThresholds: true, + isKioskMenu: false, + removeProductsBelowOptionThresholds: false, + }, + page: page_num, + perPage, + }; + + const qs = new URLSearchParams({ + operationName: 'FilteredProducts', + variables: JSON.stringify(variables), + extensions: JSON.stringify({ + persistedQuery: { version: 1, sha256Hash: hash }, + }), + }); + + const response = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, { + method: 'GET', + headers: { + 'content-type': 'application/json', + 'apollographql-client-name': 'Marketplace (production)', + }, + credentials: 'include', // Include cookies/session + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}`); + } + + return response.json(); + }, + dispensaryId, + page_num, + perPage, + includeOutOfStock, + GRAPHQL_HASHES.FilteredProducts + ); + + if (result.errors) { + console.error('[DutchieGraphQL] GraphQL errors:', result.errors); + break; + } + + const products = result?.data?.filteredProducts?.products || []; + console.log(`[DutchieGraphQL] Page ${page_num}: ${products.length} products`); + + if (products.length === 0) { + hasMore = false; + } else { + allProducts.push(...products); + page_num++; + + // Safety limit + if (page_num > 50) { + console.log('[DutchieGraphQL] Reached page limit, stopping'); + hasMore = false; + } + } + } + + // Count active vs inactive + const activeCount = allProducts.filter((p) => p.Status === 'Active').length; + const inactiveCount = allProducts.filter((p) => p.Status !== 'Active').length; + + console.log(`[DutchieGraphQL] Total: ${allProducts.length} products (${activeCount} active, ${inactiveCount} inactive)`); + + return { + products: allProducts, + dispensaryId, + totalProducts: allProducts.length, + activeCount, + inactiveCount, + }; + } finally { + if (browser) { + 
await browser.close(); + } + } +} + +/** + * Upsert products to database + */ +export async function upsertProductsDirect( + pool: Pool, + storeId: number, + products: NormalizedProduct[] +): Promise<{ inserted: number; updated: number }> { + const client = await pool.connect(); + let inserted = 0; + let updated = 0; + + try { + await client.query('BEGIN'); + + for (const product of products) { + const result = await client.query( + ` + INSERT INTO products ( + store_id, external_id, slug, name, enterprise_product_id, + brand, brand_external_id, brand_logo_url, + subcategory, strain_type, canonical_category, + price, rec_price, med_price, rec_special_price, med_special_price, + is_on_special, special_name, discount_percent, special_data, + sku, inventory_quantity, inventory_available, is_below_threshold, status, + thc_percentage, cbd_percentage, cannabinoids, + weight_mg, net_weight_value, net_weight_unit, options, raw_options, + image_url, additional_images, + is_featured, medical_only, rec_only, + source_created_at, source_updated_at, + description, raw_data, + dutchie_url, last_seen_at, updated_at + ) + VALUES ( + $1, $2, $3, $4, $5, + $6, $7, $8, + $9, $10, $11, + $12, $13, $14, $15, $16, + $17, $18, $19, $20, + $21, $22, $23, $24, $25, + $26, $27, $28, + $29, $30, $31, $32, $33, + $34, $35, + $36, $37, $38, + $39, $40, + $41, $42, + '', NOW(), NOW() + ) + ON CONFLICT (store_id, slug) DO UPDATE SET + name = EXCLUDED.name, + enterprise_product_id = EXCLUDED.enterprise_product_id, + brand = EXCLUDED.brand, + brand_external_id = EXCLUDED.brand_external_id, + brand_logo_url = EXCLUDED.brand_logo_url, + subcategory = EXCLUDED.subcategory, + strain_type = EXCLUDED.strain_type, + canonical_category = EXCLUDED.canonical_category, + price = EXCLUDED.price, + rec_price = EXCLUDED.rec_price, + med_price = EXCLUDED.med_price, + rec_special_price = EXCLUDED.rec_special_price, + med_special_price = EXCLUDED.med_special_price, + is_on_special = EXCLUDED.is_on_special, + 
special_name = EXCLUDED.special_name, + discount_percent = EXCLUDED.discount_percent, + special_data = EXCLUDED.special_data, + sku = EXCLUDED.sku, + inventory_quantity = EXCLUDED.inventory_quantity, + inventory_available = EXCLUDED.inventory_available, + is_below_threshold = EXCLUDED.is_below_threshold, + status = EXCLUDED.status, + thc_percentage = EXCLUDED.thc_percentage, + cbd_percentage = EXCLUDED.cbd_percentage, + cannabinoids = EXCLUDED.cannabinoids, + weight_mg = EXCLUDED.weight_mg, + net_weight_value = EXCLUDED.net_weight_value, + net_weight_unit = EXCLUDED.net_weight_unit, + options = EXCLUDED.options, + raw_options = EXCLUDED.raw_options, + image_url = EXCLUDED.image_url, + additional_images = EXCLUDED.additional_images, + is_featured = EXCLUDED.is_featured, + medical_only = EXCLUDED.medical_only, + rec_only = EXCLUDED.rec_only, + source_created_at = EXCLUDED.source_created_at, + source_updated_at = EXCLUDED.source_updated_at, + description = EXCLUDED.description, + raw_data = EXCLUDED.raw_data, + last_seen_at = NOW(), + updated_at = NOW() + RETURNING (xmax = 0) AS was_inserted + `, + [ + storeId, + product.external_id, + product.slug, + product.name, + product.enterprise_product_id, + product.brand, + product.brand_external_id, + product.brand_logo_url, + product.subcategory, + product.strain_type, + product.canonical_category, + product.price, + product.rec_price, + product.med_price, + product.rec_special_price, + product.med_special_price, + product.is_on_special, + product.special_name, + product.discount_percent, + product.special_data ? JSON.stringify(product.special_data) : null, + product.sku, + product.inventory_quantity, + product.inventory_available, + product.is_below_threshold, + product.status, + product.thc_percentage, + product.cbd_percentage, + product.cannabinoids ? 
JSON.stringify(product.cannabinoids) : null, + product.weight_mg, + product.net_weight_value, + product.net_weight_unit, + product.options, + product.raw_options, + product.image_url, + product.additional_images, + product.is_featured, + product.medical_only, + product.rec_only, + product.source_created_at, + product.source_updated_at, + product.description, + product.raw_data ? JSON.stringify(product.raw_data) : null, + ] + ); + + if (result.rows[0]?.was_inserted) { + inserted++; + } else { + updated++; + } + } + + await client.query('COMMIT'); + return { inserted, updated }; + } catch (error) { + await client.query('ROLLBACK'); + throw error; + } finally { + client.release(); + } +} + +/** + * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead. + * This function is disabled and will throw an error if called. + * Main entry point - scrape all products including out-of-stock + */ +export async function scrapeAllDutchieProducts( + pool: Pool, + storeId: number, + menuUrl: string +): Promise<{ + success: boolean; + totalProducts: number; + activeCount: number; + inactiveCount: number; + inserted: number; + updated: number; + error?: string; +}> { + // DEPRECATED: Throw error to prevent accidental use + throw new Error( + 'DEPRECATED: scrapeAllDutchieProducts() is deprecated. ' + + 'Use src/dutchie-az/services/product-crawler.ts instead. ' + + 'This scraper writes to the legacy products table.' 
+ ); + + // Original code below is unreachable but kept for reference + try { + console.log(`[DutchieGraphQL] Scraping ALL products (including out-of-stock): ${menuUrl}`); + + // Fetch all products via direct GraphQL + const { products, totalProducts, activeCount, inactiveCount } = await fetchAllDutchieProducts(menuUrl, { + includeOutOfStock: true, + perPage: 100, + }); + + if (products.length === 0) { + return { + success: false, + totalProducts: 0, + activeCount: 0, + inactiveCount: 0, + inserted: 0, + updated: 0, + error: 'No products returned from GraphQL', + }; + } + + // Normalize products + const normalized = products.map(normalizeDutchieProduct); + + // Upsert to database + const { inserted, updated } = await upsertProductsDirect(pool, storeId, normalized); + + console.log(`[DutchieGraphQL] Complete: ${totalProducts} products (${activeCount} active, ${inactiveCount} inactive)`); + console.log(`[DutchieGraphQL] Database: ${inserted} inserted, ${updated} updated`); + + return { + success: true, + totalProducts, + activeCount, + inactiveCount, + inserted, + updated, + }; + } catch (error: any) { + console.error(`[DutchieGraphQL] Error:`, error.message); + return { + success: false, + totalProducts: 0, + activeCount: 0, + inactiveCount: 0, + inserted: 0, + updated: 0, + error: error.message, + }; + } +} diff --git a/backend/src/scrapers/dutchie-graphql.ts b/backend/src/scrapers/dutchie-graphql.ts new file mode 100644 index 00000000..fa5fd294 --- /dev/null +++ b/backend/src/scrapers/dutchie-graphql.ts @@ -0,0 +1,711 @@ +// ============================================================================ +// DEPRECATED: This scraper writes to the LEGACY products table. +// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline. 
+// +// New pipeline location: src/dutchie-az/services/product-crawler.ts +// - Uses fetch-based GraphQL (no Puppeteer needed) +// - Writes to isolated dutchie_az_* tables with snapshot model +// - Tracks stockStatus, isPresentInFeed, missing_from_feed +// +// The normalizer functions in this file (normalizeDutchieProduct) may still +// be imported for reference, but do NOT call scrapeDutchieMenu() or upsertProducts(). +// ============================================================================ + +/** + * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead. + * This scraper writes to the legacy products table, not the new dutchie_az tables. + * + * Fetches product data via Puppeteer interception of Dutchie's GraphQL API. + * This bypasses Cloudflare by using a real browser to load the menu page. + * + * GraphQL Operations: + * - FilteredProducts: Returns paginated product list with full details + * - GetAddressBasedDispensaryData: Resolves dispensary cName to dispensaryId + */ + +import puppeteer from 'puppeteer-extra'; +import type { Browser, Page, HTTPResponse } from 'puppeteer'; +import StealthPlugin from 'puppeteer-extra-plugin-stealth'; +import { Pool } from 'pg'; + +puppeteer.use(StealthPlugin()); + +// ===================================================== +// TYPE DEFINITIONS (from captured GraphQL schema) +// ===================================================== + +export interface DutchieProduct { + _id: string; + id: string; + Name: string; + cName: string; // URL slug + enterpriseProductId?: string; + DispensaryID: string; + + // Brand + brand?: { + id: string; + name: string; + imageUrl?: string; + description?: string; + }; + brandId?: string; + brandName?: string; + brandLogo?: string; + + // Category + type?: string; // e.g., "Edible", "Flower" + subcategory?: string; // e.g., "gummies", "pre-rolls" + strainType?: string; // "Indica", "Sativa", "Hybrid", "N/A" + + // Pricing (arrays - first element is primary) + 
Prices?: number[]; + recPrices?: number[]; + medicalPrices?: number[]; + recSpecialPrices?: number[]; + medicalSpecialPrices?: number[]; + + // Specials + special?: boolean; + specialData?: { + saleSpecials?: Array<{ + specialId: string; + specialName: string; + discount: number; + percentDiscount: boolean; + dollarDiscount: boolean; + specialType: string; + }>; + bogoSpecials?: any; + }; + + // Inventory + POSMetaData?: { + canonicalSKU?: string; + canonicalCategory?: string; + canonicalName?: string; + canonicalLabResultUrl?: string; + children?: Array<{ + option: string; + price: number; + quantity: number; + quantityAvailable: number; + recPrice?: number; + medPrice?: number; + }>; + }; + Status?: string; // "Active" or "Inactive" + isBelowThreshold?: boolean; + + // Potency + THCContent?: { + unit: string; + range: number[]; + }; + CBDContent?: { + unit: string; + range: number[]; + }; + cannabinoidsV2?: Array<{ + value: number; + unit: string; + cannabinoid: { + name: string; + }; + }>; + + // Weight/Options + Options?: string[]; + rawOptions?: string[]; + weight?: number; + measurements?: { + netWeight?: { + unit: string; + values: number[]; + }; + volume?: any; + }; + + // Images + Image?: string; + images?: string[]; + + // Flags + featured?: boolean; + medicalOnly?: boolean; + recOnly?: boolean; + + // Timestamps + createdAt?: string; + updatedAt?: string; + + // Description + description?: string; + effects?: Record; + terpenes?: any[]; +} + +// Database product row +export interface NormalizedProduct { + external_id: string; + slug: string; + name: string; + enterprise_product_id?: string; + + // Brand + brand?: string; + brand_external_id?: string; + brand_logo_url?: string; + + // Category + subcategory?: string; + strain_type?: string; + canonical_category?: string; + + // Pricing + price?: number; + rec_price?: number; + med_price?: number; + rec_special_price?: number; + med_special_price?: number; + + // Specials + is_on_special: boolean; + 
special_name?: string;
+  discount_percent?: number;
+  special_data?: any;
+
+  // Inventory
+  sku?: string;
+  inventory_quantity?: number;
+  inventory_available?: number;
+  is_below_threshold: boolean;
+  status?: string;
+
+  // Potency
+  thc_percentage?: number;
+  cbd_percentage?: number;
+  cannabinoids?: any;
+
+  // Weight/Options
+  weight_mg?: number;
+  net_weight_value?: number;
+  net_weight_unit?: string;
+  options?: string[];
+  raw_options?: string[];
+
+  // Images
+  image_url?: string;
+  additional_images?: string[];
+
+  // Flags
+  is_featured: boolean;
+  medical_only: boolean;
+  rec_only: boolean;
+
+  // Timestamps
+  source_created_at?: Date;
+  source_updated_at?: Date;
+
+  // Raw
+  description?: string;
+  raw_data?: any;
+}
+
+// =====================================================
+// NORMALIZER: Dutchie GraphQL → DB Schema
+// =====================================================
+
+/**
+ * Parse a timestamp from the Dutchie feed.
+ *
+ * Accepts either an all-digit epoch-millisecond value (e.g. "1729044510543")
+ * or any date string the `Date` constructor understands (e.g. ISO 8601).
+ *
+ * @param value - Raw timestamp from the feed; may be missing.
+ * @returns A valid Date, or undefined when the value is missing or unparseable,
+ *          so an `Invalid Date` is never handed to the database layer.
+ */
+function parseSourceTimestamp(value: string | number | null | undefined): Date | undefined {
+  if (value === null || value === undefined) return undefined;
+
+  const raw = String(value).trim();
+  if (raw === '') return undefined;
+
+  // All-digit strings are epoch milliseconds; `new Date("1729044510543")`
+  // would otherwise be parsed as a date string and produce Invalid Date.
+  const parsed = /^\d+$/.test(raw) ? new Date(Number(raw)) : new Date(raw);
+  return Number.isNaN(parsed.getTime()) ? undefined : parsed;
+}
+
+/**
+ * Map one product from Dutchie's FilteredProducts GraphQL payload to the flat
+ * row shape used by the upsert functions in this file.
+ *
+ * - Price arrays are reduced to their first element.
+ * - Only the first sale special contributes `special_name` / `discount_percent`.
+ * - Inventory is summed over `POSMetaData.children`; when there are no children
+ *   the quantities are left undefined (unknown) rather than reported as 0.
+ *
+ * @param product - Raw product object as returned by the GraphQL API.
+ * @returns The normalized row; `raw_data` keeps a reference to the input object.
+ */
+export function normalizeDutchieProduct(product: DutchieProduct): NormalizedProduct {
+  // Extract first special if exists
+  const saleSpecial = product.specialData?.saleSpecials?.[0];
+
+  // Calculate inventory from POSMetaData children
+  const children = product.POSMetaData?.children || [];
+  const hasInventoryData = children.length > 0;
+  const totalQuantity = children.reduce((sum, c) => sum + (c.quantity || 0), 0);
+  const availableQuantity = children.reduce((sum, c) => sum + (c.quantityAvailable || 0), 0);
+
+  // Parse timestamps (both fields go through the same tolerant parser)
+  const sourceCreatedAt = parseSourceTimestamp(product.createdAt);
+  const sourceUpdatedAt = parseSourceTimestamp(product.updatedAt);
+
+  return {
+    // Identity
+    external_id: product._id || product.id,
+    slug: product.cName,
+    name: product.Name,
+    enterprise_product_id: product.enterpriseProductId,
+
+    // Brand
+    brand: product.brandName || product.brand?.name,
+    brand_external_id: product.brandId || product.brand?.id,
+    brand_logo_url: product.brandLogo || product.brand?.imageUrl,
+
+    // Category
+    subcategory: product.subcategory,
+    strain_type: product.strainType,
+    canonical_category: product.POSMetaData?.canonicalCategory,
+
+    // Pricing
+    price: product.Prices?.[0],
+    rec_price: product.recPrices?.[0],
+    med_price: product.medicalPrices?.[0],
+    rec_special_price: product.recSpecialPrices?.[0],
+    med_special_price: product.medicalSpecialPrices?.[0],
+
+    // Specials
+    is_on_special: product.special === true,
+    special_name: saleSpecial?.specialName,
+    discount_percent: saleSpecial?.percentDiscount ? saleSpecial.discount : undefined,
+    special_data: product.specialData,
+
+    // Inventory
+    // A summed quantity of 0 is real data (out of stock) and must not be
+    // collapsed into "unknown"; only a missing children list is unknown.
+    sku: product.POSMetaData?.canonicalSKU,
+    inventory_quantity: hasInventoryData ? totalQuantity : undefined,
+    inventory_available: hasInventoryData ? availableQuantity : undefined,
+    is_below_threshold: product.isBelowThreshold === true,
+    status: product.Status,
+
+    // Potency
+    thc_percentage: product.THCContent?.range?.[0],
+    cbd_percentage: product.CBDContent?.range?.[0],
+    cannabinoids: product.cannabinoidsV2,
+
+    // Weight/Options
+    weight_mg: product.weight,
+    net_weight_value: product.measurements?.netWeight?.values?.[0],
+    net_weight_unit: product.measurements?.netWeight?.unit,
+    options: product.Options,
+    raw_options: product.rawOptions,
+
+    // Images
+    image_url: product.Image,
+    additional_images: product.images?.length ? product.images : undefined,
+
+    // Flags
+    is_featured: product.featured === true,
+    medical_only: product.medicalOnly === true,
+    rec_only: product.recOnly === true,
+
+    // Timestamps
+    source_created_at: sourceCreatedAt,
+    source_updated_at: sourceUpdatedAt,
+
+    // Description
+    description: typeof product.description === 'string' ? product.description : undefined,
+
+    // Raw
+    raw_data: product,
+  };
+}
+
+// =====================================================
+// PUPPETEER SCRAPER
+// =====================================================
+
+interface CapturedProducts {
+  products: DutchieProduct[];
+  dispensaryId: string;
+  menuUrl: string;
+}
+
+export async function fetchDutchieMenuViaPuppeteer(
+  menuUrl: string,
+  options: {
+    headless?: boolean | 'new';
+    timeout?: number;
+    maxScrolls?: number;
+  } = {}
+): Promise<CapturedProducts> {
+  const {
+    headless = 'new',
+    timeout = 90000,
+    maxScrolls = 30, // Increased for full menu capture
+  } = options;
+
+  let browser: Browser | undefined;
+  const capturedProducts: DutchieProduct[] = [];
+  let dispensaryId = '';
+
+  try {
+    browser = await puppeteer.launch({
+      headless,
+      args: [
+        '--no-sandbox',
+        '--disable-setuid-sandbox',
+        '--disable-dev-shm-usage',
+        '--disable-blink-features=AutomationControlled',
+      ],
+    });
+
+    const page = await browser.newPage();
+
+    // Stealth configuration
+    await page.setUserAgent(
+      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+    );
+    await page.setViewport({ width: 1920, height: 1080 });
+    await page.evaluateOnNewDocument(() => {
+      Object.defineProperty(navigator, 'webdriver', { get: () => false });
+      (window as any).chrome = { runtime: {} };
+    });
+
+    // Track seen product IDs to avoid duplicates
+    const seenIds = new Set<string>();
+
+    // Intercept GraphQL responses
+    page.on('response', async (response: HTTPResponse) => {
+      const url = response.url();
+      if (!url.includes('graphql')) return;
+
+      try {
+        const contentType = response.headers()['content-type'] || '';
+        if (!contentType.includes('application/json')) return;
+
+        const data = await response.json();
+
+        // Capture dispensary ID
+        if (data?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId) {
+          dispensaryId = data.data.getAddressBasedDispensaryData.dispensaryData.dispensaryId;
+        }
+
+        
// Capture products from FilteredProducts + if (data?.data?.filteredProducts?.products) { + const products = data.data.filteredProducts.products as DutchieProduct[]; + for (const product of products) { + if (!seenIds.has(product._id)) { + seenIds.add(product._id); + capturedProducts.push(product); + } + } + } + } catch { + // Ignore parse errors + } + }); + + // Navigate to menu + console.log('[DutchieGraphQL] Loading menu page...'); + await page.goto(menuUrl, { + waitUntil: 'networkidle2', + timeout, + }); + + // Get dispensary ID from window.reactEnv if not captured + if (!dispensaryId) { + dispensaryId = await page.evaluate(() => { + const env = (window as any).reactEnv; + return env?.dispensaryId || env?.retailerId || ''; + }); + } + + // Helper function to scroll through a page until no more products load + async function scrollToLoadAll(maxScrollAttempts: number = maxScrolls): Promise { + let scrollCount = 0; + let previousCount = 0; + let noNewProductsCount = 0; + + while (scrollCount < maxScrollAttempts && noNewProductsCount < 3) { + await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); + await new Promise((r) => setTimeout(r, 1500)); + + const currentCount = seenIds.size; + if (currentCount === previousCount) { + noNewProductsCount++; + } else { + noNewProductsCount = 0; + } + previousCount = currentCount; + scrollCount++; + } + } + + // First, scroll through the main page (all products) + console.log('[DutchieGraphQL] Scrolling main page...'); + await scrollToLoadAll(); + console.log(`[DutchieGraphQL] After main page: ${seenIds.size} products`); + + // Get category links from the navigation + const categoryLinks = await page.evaluate(() => { + const links: string[] = []; + // Look for category navigation links + const navLinks = document.querySelectorAll('a[href*="/products/"]'); + navLinks.forEach((link) => { + const href = (link as HTMLAnchorElement).href; + if (href && !links.includes(href)) { + links.push(href); + } + }); + return 
links; + }); + + console.log(`[DutchieGraphQL] Found ${categoryLinks.length} category links`); + + // Visit each category page to capture all products + for (const categoryUrl of categoryLinks) { + try { + console.log(`[DutchieGraphQL] Visiting category: ${categoryUrl.split('/').pop()}`); + await page.goto(categoryUrl, { + waitUntil: 'networkidle2', + timeout: 30000, + }); + await scrollToLoadAll(15); // Fewer scrolls per category + console.log(`[DutchieGraphQL] Total products: ${seenIds.size}`); + } catch (e: any) { + console.log(`[DutchieGraphQL] Category error: ${e.message}`); + } + } + + // Wait for any final responses + await new Promise((r) => setTimeout(r, 2000)); + + return { + products: capturedProducts, + dispensaryId, + menuUrl, + }; + } finally { + if (browser) { + await browser.close(); + } + } +} + +// ===================================================== +// DATABASE OPERATIONS +// ===================================================== + +export async function upsertProducts( + pool: Pool, + storeId: number, + products: NormalizedProduct[] +): Promise<{ inserted: number; updated: number }> { + const client = await pool.connect(); + let inserted = 0; + let updated = 0; + + try { + await client.query('BEGIN'); + + for (const product of products) { + // Upsert product + const result = await client.query( + ` + INSERT INTO products ( + store_id, external_id, slug, name, enterprise_product_id, + brand, brand_external_id, brand_logo_url, + subcategory, strain_type, canonical_category, + price, rec_price, med_price, rec_special_price, med_special_price, + is_on_special, special_name, discount_percent, special_data, + sku, inventory_quantity, inventory_available, is_below_threshold, status, + thc_percentage, cbd_percentage, cannabinoids, + weight_mg, net_weight_value, net_weight_unit, options, raw_options, + image_url, additional_images, + is_featured, medical_only, rec_only, + source_created_at, source_updated_at, + description, raw_data, + dutchie_url, 
last_seen_at, updated_at + ) + VALUES ( + $1, $2, $3, $4, $5, + $6, $7, $8, + $9, $10, $11, + $12, $13, $14, $15, $16, + $17, $18, $19, $20, + $21, $22, $23, $24, $25, + $26, $27, $28, + $29, $30, $31, $32, $33, + $34, $35, + $36, $37, $38, + $39, $40, + $41, $42, + '', NOW(), NOW() + ) + ON CONFLICT (store_id, slug) DO UPDATE SET + name = EXCLUDED.name, + enterprise_product_id = EXCLUDED.enterprise_product_id, + brand = EXCLUDED.brand, + brand_external_id = EXCLUDED.brand_external_id, + brand_logo_url = EXCLUDED.brand_logo_url, + subcategory = EXCLUDED.subcategory, + strain_type = EXCLUDED.strain_type, + canonical_category = EXCLUDED.canonical_category, + price = EXCLUDED.price, + rec_price = EXCLUDED.rec_price, + med_price = EXCLUDED.med_price, + rec_special_price = EXCLUDED.rec_special_price, + med_special_price = EXCLUDED.med_special_price, + is_on_special = EXCLUDED.is_on_special, + special_name = EXCLUDED.special_name, + discount_percent = EXCLUDED.discount_percent, + special_data = EXCLUDED.special_data, + sku = EXCLUDED.sku, + inventory_quantity = EXCLUDED.inventory_quantity, + inventory_available = EXCLUDED.inventory_available, + is_below_threshold = EXCLUDED.is_below_threshold, + status = EXCLUDED.status, + thc_percentage = EXCLUDED.thc_percentage, + cbd_percentage = EXCLUDED.cbd_percentage, + cannabinoids = EXCLUDED.cannabinoids, + weight_mg = EXCLUDED.weight_mg, + net_weight_value = EXCLUDED.net_weight_value, + net_weight_unit = EXCLUDED.net_weight_unit, + options = EXCLUDED.options, + raw_options = EXCLUDED.raw_options, + image_url = EXCLUDED.image_url, + additional_images = EXCLUDED.additional_images, + is_featured = EXCLUDED.is_featured, + medical_only = EXCLUDED.medical_only, + rec_only = EXCLUDED.rec_only, + source_created_at = EXCLUDED.source_created_at, + source_updated_at = EXCLUDED.source_updated_at, + description = EXCLUDED.description, + raw_data = EXCLUDED.raw_data, + last_seen_at = NOW(), + updated_at = NOW() + RETURNING (xmax = 0) AS 
was_inserted + `, + [ + storeId, + product.external_id, + product.slug, + product.name, + product.enterprise_product_id, + product.brand, + product.brand_external_id, + product.brand_logo_url, + product.subcategory, + product.strain_type, + product.canonical_category, + product.price, + product.rec_price, + product.med_price, + product.rec_special_price, + product.med_special_price, + product.is_on_special, + product.special_name, + product.discount_percent, + product.special_data ? JSON.stringify(product.special_data) : null, + product.sku, + product.inventory_quantity, + product.inventory_available, + product.is_below_threshold, + product.status, + product.thc_percentage, + product.cbd_percentage, + product.cannabinoids ? JSON.stringify(product.cannabinoids) : null, + product.weight_mg, + product.net_weight_value, + product.net_weight_unit, + product.options, + product.raw_options, + product.image_url, + product.additional_images, + product.is_featured, + product.medical_only, + product.rec_only, + product.source_created_at, + product.source_updated_at, + product.description, + product.raw_data ? JSON.stringify(product.raw_data) : null, + ] + ); + + if (result.rows[0]?.was_inserted) { + inserted++; + } else { + updated++; + } + } + + await client.query('COMMIT'); + return { inserted, updated }; + } catch (error) { + await client.query('ROLLBACK'); + throw error; + } finally { + client.release(); + } +} + +// ===================================================== +// MAIN ENTRY POINT +// ===================================================== + +/** + * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead. + * This function is disabled and will throw an error if called. 
+ *
+ * The function is `async`, so the unconditional `throw` at the top of the body
+ * reaches callers as a rejected promise rather than a synchronous exception.
+ * Everything after that `throw` is unreachable and is retained only as a
+ * record of the legacy flow (fetch via Puppeteer -> normalize -> upsert).
+ *
+ * @param pool - pg pool; referenced only by the unreachable legacy body (passed to upsertProducts).
+ * @param storeId - referenced only by the unreachable legacy body (passed to upsertProducts).
+ * @param menuUrl - referenced only by the unreachable legacy body (passed to fetchDutchieMenuViaPuppeteer).
+ * @returns Declared result shape of the legacy implementation; never produced, because every call rejects.
+ * @throws Error whose message starts with "DEPRECATED:" on every call.
+ */
+export async function scrapeDutchieMenu(
+  pool: Pool,
+  storeId: number,
+  menuUrl: string
+): Promise<{
+  success: boolean;
+  productsFound: number;
+  inserted: number;
+  updated: number;
+  error?: string;
+}> {
+  // DEPRECATED: Throw error to prevent accidental use
+  throw new Error(
+    'DEPRECATED: scrapeDutchieMenu() is deprecated. ' +
+    'Use src/dutchie-az/services/product-crawler.ts instead. ' +
+    'This scraper writes to the legacy products table.'
+  );
+
+  // Original code below is unreachable but kept for reference
+  try {
+    console.log(`[DutchieGraphQL] Scraping: ${menuUrl}`);
+
+    // Fetch products via Puppeteer
+    const { products, dispensaryId } = await fetchDutchieMenuViaPuppeteer(menuUrl);
+
+    console.log(`[DutchieGraphQL] Captured ${products.length} products, dispensaryId: ${dispensaryId}`);
+
+    // An empty capture was reported as a failure result, not as an exception
+    if (products.length === 0) {
+      return {
+        success: false,
+        productsFound: 0,
+        inserted: 0,
+        updated: 0,
+        error: 'No products captured from GraphQL responses',
+      };
+    }
+
+    // Normalize products
+    const normalized = products.map(normalizeDutchieProduct);
+
+    // Upsert to database
+    const { inserted, updated } = await upsertProducts(pool, storeId, normalized);
+
+    console.log(`[DutchieGraphQL] Upsert complete: ${inserted} inserted, ${updated} updated`);
+
+    return {
+      success: true,
+      productsFound: products.length,
+      inserted,
+      updated,
+    };
+  } catch (error: any) {
+    // Legacy contract: failures were logged and folded into the result object
+    console.error(`[DutchieGraphQL] Error:`, error.message);
+    return {
+      success: false,
+      productsFound: 0,
+      inserted: 0,
+      updated: 0,
+      error: error.message,
+    };
+  }
+}
diff --git a/backend/src/scrapers/templates/dutchie.ts b/backend/src/scrapers/templates/dutchie.ts
index a2f22e1b..3a5ae81e 100644
--- a/backend/src/scrapers/templates/dutchie.ts
+++ b/backend/src/scrapers/templates/dutchie.ts
@@ -1,3 +1,9 @@
+// ============================================================================
+// DEPRECATED: Dutchie now crawled via GraphQL only (see dutchie-az pipeline)
+// DO NOT USE - 
This HTML scraper is unreliable and targets the legacy products table. +// All Dutchie crawling must go through: src/dutchie-az/services/product-crawler.ts +// ============================================================================ + import { Page } from 'playwright'; import { logger } from '../../services/logger'; @@ -9,8 +15,9 @@ export interface ScraperTemplate { } /** - * Dutchie marketplace scraper template - * Used for: dutchie.com/dispensary/* URLs + * @deprecated DEPRECATED - Dutchie HTML scraping is no longer supported. + * Use the dutchie-az GraphQL pipeline instead: src/dutchie-az/services/product-crawler.ts + * This template relied on unstable DOM selectors and wrote to legacy tables. */ export const dutchieTemplate: ScraperTemplate = { name: 'Dutchie Marketplace', diff --git a/backend/src/scripts/capture-dutchie-schema.ts b/backend/src/scripts/capture-dutchie-schema.ts new file mode 100644 index 00000000..0f79fd5e --- /dev/null +++ b/backend/src/scripts/capture-dutchie-schema.ts @@ -0,0 +1,236 @@ +/** + * Capture Dutchie GraphQL response structure via Puppeteer interception + * This script navigates to a Dutchie menu page and captures the GraphQL responses + * to understand the exact product data structure + */ + +import puppeteer from 'puppeteer-extra'; +import StealthPlugin from 'puppeteer-extra-plugin-stealth'; +import * as fs from 'fs'; + +puppeteer.use(StealthPlugin()); + +interface CapturedResponse { + operationName: string; + url: string; + data: any; + timestamp: Date; +} + +async function captureSchema(menuUrl: string) { + let browser; + const capturedResponses: CapturedResponse[] = []; + + try { + console.log('='.repeat(80)); + console.log('DUTCHIE GRAPHQL SCHEMA CAPTURE'); + console.log('='.repeat(80)); + console.log(`\nTarget URL: ${menuUrl}\n`); + + browser = await puppeteer.launch({ + headless: 'new', + args: [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-dev-shm-usage', + 
'--disable-blink-features=AutomationControlled', + ] + }); + + const page = await browser.newPage(); + + // Use a realistic user agent + await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'); + + // Set viewport to desktop size + await page.setViewport({ width: 1920, height: 1080 }); + + // Hide webdriver flag + await page.evaluateOnNewDocument(() => { + Object.defineProperty(navigator, 'webdriver', { get: () => false }); + (window as any).chrome = { runtime: {} }; + }); + + // Intercept all GraphQL responses + page.on('response', async (response) => { + const url = response.url(); + + // Only capture GraphQL responses + if (!url.includes('graphql')) return; + + try { + const contentType = response.headers()['content-type'] || ''; + if (!contentType.includes('application/json')) return; + + const data = await response.json(); + + // Extract operation name from URL if possible + const urlParams = new URLSearchParams(url.split('?')[1] || ''); + const operationName = urlParams.get('operationName') || 'Unknown'; + + capturedResponses.push({ + operationName, + url: url.substring(0, 200), + data, + timestamp: new Date() + }); + + console.log(`📡 Captured: ${operationName}`); + + // Check for product data + if (data?.data?.filteredProducts?.products) { + const products = data.data.filteredProducts.products; + console.log(` Found ${products.length} products`); + } + } catch (e) { + // Ignore parse errors + } + }); + + console.log('Navigating to page...'); + await page.goto(menuUrl, { + waitUntil: 'networkidle2', + timeout: 90000 + }); + + // Check if it's a Dutchie menu + const isDutchie = await page.evaluate(() => { + return typeof (window as any).reactEnv !== 'undefined'; + }); + + if (isDutchie) { + console.log('✅ Dutchie menu detected\n'); + + // Get environment info + const reactEnv = await page.evaluate(() => (window as any).reactEnv); + console.log('Dutchie Environment:'); + 
console.log(` dispensaryId: ${reactEnv?.dispensaryId}`); + console.log(` retailerId: ${reactEnv?.retailerId}`); + console.log(` chainId: ${reactEnv?.chainId}`); + } + + // Scroll to trigger lazy loading + console.log('\nScrolling to load more products...'); + await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight)); + await new Promise(r => setTimeout(r, 3000)); + + // Click on a category to trigger more loads + const categoryLinks = await page.$$('a[href*="/products/"]'); + if (categoryLinks.length > 0) { + console.log(`Found ${categoryLinks.length} category links, clicking first one...`); + try { + await categoryLinks[0].click(); + await page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 }); + } catch (e) { + console.log('Category navigation failed, continuing...'); + } + } + + // Wait a bit more for any final responses + await new Promise(r => setTimeout(r, 2000)); + + console.log(`\n${'='.repeat(80)}`); + console.log(`CAPTURED ${capturedResponses.length} GRAPHQL RESPONSES`); + console.log('='.repeat(80)); + + // Find product data + let productSchema: any = null; + let sampleProduct: any = null; + + for (const resp of capturedResponses) { + console.log(`\n${resp.operationName}:`); + console.log(` URL: ${resp.url.substring(0, 100)}...`); + + if (resp.data?.data?.filteredProducts?.products) { + const products = resp.data.data.filteredProducts.products; + console.log(` ✅ Contains ${products.length} products`); + + if (products.length > 0 && !sampleProduct) { + sampleProduct = products[0]; + productSchema = extractSchema(products[0]); + } + } + + // Show top-level data keys + if (resp.data?.data) { + console.log(` Data keys: ${Object.keys(resp.data.data).join(', ')}`); + } + } + + // Output the product schema + if (productSchema) { + console.log('\n' + '='.repeat(80)); + console.log('PRODUCT SCHEMA (from first product):'); + console.log('='.repeat(80)); + console.log(JSON.stringify(productSchema, null, 2)); + + console.log('\n' + 
'='.repeat(80)); + console.log('SAMPLE PRODUCT:'); + console.log('='.repeat(80)); + console.log(JSON.stringify(sampleProduct, null, 2)); + + // Save to file + const outputData = { + capturedAt: new Date().toISOString(), + menuUrl, + schema: productSchema, + sampleProduct, + allResponses: capturedResponses.map(r => ({ + operationName: r.operationName, + dataKeys: r.data?.data ? Object.keys(r.data.data) : [], + productCount: r.data?.data?.filteredProducts?.products?.length || 0 + })) + }; + + const outputPath = '/tmp/dutchie-schema-capture.json'; + fs.writeFileSync(outputPath, JSON.stringify(outputData, null, 2)); + console.log(`\nSaved capture to: ${outputPath}`); + } else { + console.log('\n❌ No product data captured'); + + // Debug: show all responses + console.log('\nAll captured responses:'); + for (const resp of capturedResponses) { + console.log(`\n${resp.operationName}:`); + console.log(JSON.stringify(resp.data, null, 2).substring(0, 500)); + } + } + + } catch (error: any) { + console.error('Error:', error.message); + } finally { + if (browser) { + await browser.close(); + } + } +} + +/** + * Extract schema from an object (field names + types) + */ +function extractSchema(obj: any, prefix = ''): any { + if (obj === null) return { type: 'null' }; + if (obj === undefined) return { type: 'undefined' }; + + if (Array.isArray(obj)) { + if (obj.length === 0) return { type: 'array', items: 'unknown' }; + return { + type: 'array', + items: extractSchema(obj[0], prefix + '[]') + }; + } + + if (typeof obj === 'object') { + const schema: any = { type: 'object', properties: {} }; + for (const [key, value] of Object.entries(obj)) { + schema.properties[key] = extractSchema(value, prefix ? 
`${prefix}.${key}` : key); + } + return schema; + } + + return { type: typeof obj, example: String(obj).substring(0, 100) }; +} + +// Run +const url = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted'; +captureSchema(url).catch(console.error); diff --git a/backend/src/scripts/crawl-all-dutchie.ts b/backend/src/scripts/crawl-all-dutchie.ts new file mode 100644 index 00000000..d8575d30 --- /dev/null +++ b/backend/src/scripts/crawl-all-dutchie.ts @@ -0,0 +1,66 @@ +/** + * Seed crawl: trigger dutchie crawls for all dispensaries with menu_type='dutchie' + * and a resolved platform_dispensary_id. This uses the AZ orchestrator endpoint logic. + * + * Usage (local): + * node dist/scripts/crawl-all-dutchie.js + * + * Requires: + * - DATABASE_URL/CRAWLSY_DATABASE_URL pointing to the consolidated DB + * - Dispensaries table populated with menu_type and platform_dispensary_id + */ + +import { query } from '../dutchie-az/db/connection'; +import { runDispensaryOrchestrator } from '../services/dispensary-orchestrator'; + +async function main() { + const { rows } = await query<{ + id: number; + name: string; + slug: string; + platform_dispensary_id: string | null; + }>(` + SELECT id, name, slug, platform_dispensary_id + FROM dispensaries + WHERE menu_type = 'dutchie' + AND platform_dispensary_id IS NOT NULL + ORDER BY id + `); + + if (!rows.length) { + console.log('No dutchie dispensaries with resolved platform_dispensary_id found.'); + process.exit(0); + } + + console.log(`Found ${rows.length} dutchie dispensaries with resolved IDs. 
Triggering crawls...`); + + let success = 0; + let failed = 0; + + for (const row of rows) { + try { + console.log(`Crawling ${row.id} (${row.name})...`); + const result = await runDispensaryOrchestrator(row.id); + const ok = + result.status === 'success' || + result.status === 'sandbox_only' || + result.status === 'detection_only'; + if (ok) { + success++; + } else { + failed++; + console.warn(`Crawl returned status ${result.status} for ${row.id} (${row.name})`); + } + } catch (err: any) { + failed++; + console.error(`Failed crawl for ${row.id} (${row.name}): ${err.message}`); + } + } + + console.log(`Completed. Success: ${success}, Failed: ${failed}`); +} + +main().catch((err) => { + console.error('Fatal:', err); + process.exit(1); +}); diff --git a/backend/src/scripts/run-dutchie-scrape.ts b/backend/src/scripts/run-dutchie-scrape.ts new file mode 100644 index 00000000..6682f7b3 --- /dev/null +++ b/backend/src/scripts/run-dutchie-scrape.ts @@ -0,0 +1,139 @@ +/** + * Run Dutchie GraphQL Scrape + * + * This script demonstrates the full pipeline: + * 1. Puppeteer navigates to Dutchie menu + * 2. GraphQL responses are intercepted + * 3. Products are normalized to our schema + * 4. Products are upserted to database + * 5. 
Derived views (brands, categories, specials) are automatically updated + */ + +import { Pool } from 'pg'; +import { scrapeDutchieMenu } from '../scrapers/dutchie-graphql'; + +const DATABASE_URL = process.env.DATABASE_URL || 'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus'; + +async function main() { + const pool = new Pool({ connectionString: DATABASE_URL }); + + try { + console.log('='.repeat(80)); + console.log('DUTCHIE GRAPHQL SCRAPER - FULL PIPELINE TEST'); + console.log('='.repeat(80)); + console.log(`Database: ${DATABASE_URL.replace(/:[^:@]+@/, ':***@')}`); + + // Configuration + const storeId = 1; // Deeply Rooted + const menuUrl = 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted'; + + console.log(`\nStore ID: ${storeId}`); + console.log(`Menu URL: ${menuUrl}`); + console.log('\n' + '-'.repeat(80)); + + // Run the scrape + console.log('\n🚀 Starting scrape...\n'); + const result = await scrapeDutchieMenu(pool, storeId, menuUrl); + + console.log('\n' + '-'.repeat(80)); + console.log('📊 SCRAPE RESULTS:'); + console.log('-'.repeat(80)); + console.log(` Success: ${result.success}`); + console.log(` Products Found: ${result.productsFound}`); + console.log(` Inserted: ${result.inserted}`); + console.log(` Updated: ${result.updated}`); + if (result.error) { + console.log(` Error: ${result.error}`); + } + + // Query derived views to show the result + if (result.success) { + console.log('\n' + '-'.repeat(80)); + console.log('📈 DERIVED DATA (from products table):'); + console.log('-'.repeat(80)); + + // Brands + const brandsResult = await pool.query(` + SELECT brand_name, product_count, min_price, max_price + FROM derived_brands + WHERE store_id = $1 + ORDER BY product_count DESC + LIMIT 5 + `, [storeId]); + + console.log('\nTop 5 Brands:'); + brandsResult.rows.forEach(row => { + console.log(` - ${row.brand_name}: ${row.product_count} products ($${row.min_price} - $${row.max_price})`); + }); + + // Specials + const specialsResult = await 
pool.query(` + SELECT name, brand, rec_price, rec_special_price, discount_percent + FROM current_specials + WHERE store_id = $1 + LIMIT 5 + `, [storeId]); + + console.log('\nTop 5 Specials:'); + if (specialsResult.rows.length === 0) { + console.log(' (No specials found - is_on_special may not be populated yet)'); + } else { + specialsResult.rows.forEach(row => { + console.log(` - ${row.name} (${row.brand}): $${row.rec_price} → $${row.rec_special_price} (${row.discount_percent}% off)`); + }); + } + + // Categories + const categoriesResult = await pool.query(` + SELECT category_name, product_count + FROM derived_categories + WHERE store_id = $1 + ORDER BY product_count DESC + LIMIT 5 + `, [storeId]); + + console.log('\nTop 5 Categories:'); + if (categoriesResult.rows.length === 0) { + console.log(' (No categories found - subcategory may not be populated yet)'); + } else { + categoriesResult.rows.forEach(row => { + console.log(` - ${row.category_name}: ${row.product_count} products`); + }); + } + + // Sample product + const sampleResult = await pool.query(` + SELECT name, brand, subcategory, rec_price, rec_special_price, is_on_special, thc_percentage, status + FROM products + WHERE store_id = $1 AND subcategory IS NOT NULL + ORDER BY updated_at DESC + LIMIT 1 + `, [storeId]); + + if (sampleResult.rows.length > 0) { + const sample = sampleResult.rows[0]; + console.log('\nSample Product (with new fields):'); + console.log(` Name: ${sample.name}`); + console.log(` Brand: ${sample.brand}`); + console.log(` Category: ${sample.subcategory}`); + console.log(` Price: $${sample.rec_price}`); + console.log(` Sale Price: ${sample.rec_special_price ? 
`$${sample.rec_special_price}` : 'N/A'}`); + console.log(` On Special: ${sample.is_on_special}`); + console.log(` THC: ${sample.thc_percentage}%`); + console.log(` Status: ${sample.status}`); + } + } + + console.log('\n' + '='.repeat(80)); + console.log('✅ SCRAPE COMPLETE'); + console.log('='.repeat(80)); + + } catch (error: any) { + console.error('\n❌ Error:', error.message); + throw error; + } finally { + await pool.end(); + } +} + +main().catch(console.error); diff --git a/backend/src/scripts/scrape-all-active.ts b/backend/src/scripts/scrape-all-active.ts new file mode 100644 index 00000000..164d4bc2 --- /dev/null +++ b/backend/src/scripts/scrape-all-active.ts @@ -0,0 +1,319 @@ +/** + * Scrape ALL active products via direct GraphQL pagination + * This is more reliable than category navigation + */ + +import puppeteer from 'puppeteer-extra'; +import StealthPlugin from 'puppeteer-extra-plugin-stealth'; +import { Pool } from 'pg'; +import { normalizeDutchieProduct, DutchieProduct } from '../scrapers/dutchie-graphql'; + +puppeteer.use(StealthPlugin()); + +const DATABASE_URL = + process.env.DATABASE_URL || 'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus'; +const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0'; + +async function scrapeAllProducts(menuUrl: string, storeId: number) { + const pool = new Pool({ connectionString: DATABASE_URL }); + + const browser = await puppeteer.launch({ + headless: 'new', + args: ['--no-sandbox', '--disable-setuid-sandbox'], + }); + + try { + const page = await browser.newPage(); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36' + ); + + console.log('Loading menu to establish session...'); + await page.goto(menuUrl, { + waitUntil: 'networkidle2', + timeout: 60000, + }); + await new Promise((r) => setTimeout(r, 3000)); + + const dispensaryId = await page.evaluate(() => (window as any).reactEnv?.dispensaryId); 
+ console.log('Dispensary ID:', dispensaryId); + + // Paginate through all products + const allProducts: DutchieProduct[] = []; + let pageNum = 0; + const perPage = 100; + + console.log('\nFetching all products via paginated GraphQL...'); + + while (true) { + const result = await page.evaluate( + async (dispId: string, hash: string, page: number, perPage: number) => { + const variables = { + includeEnterpriseSpecials: false, + productsFilter: { + dispensaryId: dispId, + pricingType: 'rec', + Status: 'Active', + types: [], + useCache: false, + isDefaultSort: true, + sortBy: 'popularSortIdx', + sortDirection: 1, + bypassOnlineThresholds: true, + isKioskMenu: false, + removeProductsBelowOptionThresholds: false, + }, + page, + perPage, + }; + + const qs = new URLSearchParams({ + operationName: 'FilteredProducts', + variables: JSON.stringify(variables), + extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }), + }); + + const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, { + method: 'GET', + headers: { + 'content-type': 'application/json', + 'apollographql-client-name': 'Marketplace (production)', + }, + credentials: 'include', + }); + + const json = await resp.json(); + return { + products: json?.data?.filteredProducts?.products || [], + totalCount: json?.data?.filteredProducts?.queryInfo?.totalCount, + }; + }, + dispensaryId, + GRAPHQL_HASH, + pageNum, + perPage + ); + + if (result.products.length === 0) { + break; + } + + allProducts.push(...result.products); + console.log( + `Page ${pageNum}: ${result.products.length} products (total so far: ${allProducts.length}/${result.totalCount})` + ); + + pageNum++; + + // Safety limit + if (pageNum > 50) { + console.log('Reached page limit'); + break; + } + } + + console.log(`\nTotal products fetched: ${allProducts.length}`); + + // Normalize and upsert + console.log('\nNormalizing and upserting to database...'); + const normalized = allProducts.map(normalizeDutchieProduct); + + 
const client = await pool.connect(); + let inserted = 0; + let updated = 0; + + try { + await client.query('BEGIN'); + + for (const product of normalized) { + const result = await client.query( + ` + INSERT INTO products ( + store_id, external_id, slug, name, enterprise_product_id, + brand, brand_external_id, brand_logo_url, + subcategory, strain_type, canonical_category, + price, rec_price, med_price, rec_special_price, med_special_price, + is_on_special, special_name, discount_percent, special_data, + sku, inventory_quantity, inventory_available, is_below_threshold, status, + thc_percentage, cbd_percentage, cannabinoids, + weight_mg, net_weight_value, net_weight_unit, options, raw_options, + image_url, additional_images, + is_featured, medical_only, rec_only, + source_created_at, source_updated_at, + description, raw_data, + dutchie_url, last_seen_at, updated_at + ) + VALUES ( + $1, $2, $3, $4, $5, + $6, $7, $8, + $9, $10, $11, + $12, $13, $14, $15, $16, + $17, $18, $19, $20, + $21, $22, $23, $24, $25, + $26, $27, $28, + $29, $30, $31, $32, $33, + $34, $35, + $36, $37, $38, + $39, $40, + $41, $42, + '', NOW(), NOW() + ) + ON CONFLICT (store_id, slug) DO UPDATE SET + name = EXCLUDED.name, + enterprise_product_id = EXCLUDED.enterprise_product_id, + brand = EXCLUDED.brand, + brand_external_id = EXCLUDED.brand_external_id, + brand_logo_url = EXCLUDED.brand_logo_url, + subcategory = EXCLUDED.subcategory, + strain_type = EXCLUDED.strain_type, + canonical_category = EXCLUDED.canonical_category, + price = EXCLUDED.price, + rec_price = EXCLUDED.rec_price, + med_price = EXCLUDED.med_price, + rec_special_price = EXCLUDED.rec_special_price, + med_special_price = EXCLUDED.med_special_price, + is_on_special = EXCLUDED.is_on_special, + special_name = EXCLUDED.special_name, + discount_percent = EXCLUDED.discount_percent, + special_data = EXCLUDED.special_data, + sku = EXCLUDED.sku, + inventory_quantity = EXCLUDED.inventory_quantity, + inventory_available = 
EXCLUDED.inventory_available, + is_below_threshold = EXCLUDED.is_below_threshold, + status = EXCLUDED.status, + thc_percentage = EXCLUDED.thc_percentage, + cbd_percentage = EXCLUDED.cbd_percentage, + cannabinoids = EXCLUDED.cannabinoids, + weight_mg = EXCLUDED.weight_mg, + net_weight_value = EXCLUDED.net_weight_value, + net_weight_unit = EXCLUDED.net_weight_unit, + options = EXCLUDED.options, + raw_options = EXCLUDED.raw_options, + image_url = EXCLUDED.image_url, + additional_images = EXCLUDED.additional_images, + is_featured = EXCLUDED.is_featured, + medical_only = EXCLUDED.medical_only, + rec_only = EXCLUDED.rec_only, + source_created_at = EXCLUDED.source_created_at, + source_updated_at = EXCLUDED.source_updated_at, + description = EXCLUDED.description, + raw_data = EXCLUDED.raw_data, + last_seen_at = NOW(), + updated_at = NOW() + RETURNING (xmax = 0) AS was_inserted + `, + [ + storeId, + product.external_id, + product.slug, + product.name, + product.enterprise_product_id, + product.brand, + product.brand_external_id, + product.brand_logo_url, + product.subcategory, + product.strain_type, + product.canonical_category, + product.price, + product.rec_price, + product.med_price, + product.rec_special_price, + product.med_special_price, + product.is_on_special, + product.special_name, + product.discount_percent, + product.special_data ? JSON.stringify(product.special_data) : null, + product.sku, + product.inventory_quantity, + product.inventory_available, + product.is_below_threshold, + product.status, + product.thc_percentage, + product.cbd_percentage, + product.cannabinoids ? JSON.stringify(product.cannabinoids) : null, + product.weight_mg, + product.net_weight_value, + product.net_weight_unit, + product.options, + product.raw_options, + product.image_url, + product.additional_images, + product.is_featured, + product.medical_only, + product.rec_only, + product.source_created_at, + product.source_updated_at, + product.description, + product.raw_data ? 
JSON.stringify(product.raw_data) : null, + ] + ); + + if (result.rows[0]?.was_inserted) { + inserted++; + } else { + updated++; + } + } + + await client.query('COMMIT'); + } catch (error) { + await client.query('ROLLBACK'); + throw error; + } finally { + client.release(); + } + + console.log(`\nDatabase: ${inserted} inserted, ${updated} updated`); + + // Show summary stats + const stats = await pool.query( + ` + SELECT + COUNT(*) as total, + COUNT(*) FILTER (WHERE is_on_special) as specials, + COUNT(DISTINCT brand) as brands, + COUNT(DISTINCT subcategory) as categories + FROM products WHERE store_id = $1 + `, + [storeId] + ); + + console.log('\nStore summary:'); + console.log(` Total products: ${stats.rows[0].total}`); + console.log(` On special: ${stats.rows[0].specials}`); + console.log(` Unique brands: ${stats.rows[0].brands}`); + console.log(` Categories: ${stats.rows[0].categories}`); + + return { + success: true, + totalProducts: allProducts.length, + inserted, + updated, + }; + } finally { + await browser.close(); + await pool.end(); + } +} + +// Run +const menuUrl = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted'; +const storeId = parseInt(process.argv[3] || '1', 10); + +console.log('='.repeat(60)); +console.log('DUTCHIE GRAPHQL FULL SCRAPE'); +console.log('='.repeat(60)); +console.log(`Menu URL: ${menuUrl}`); +console.log(`Store ID: ${storeId}`); +console.log(''); + +scrapeAllProducts(menuUrl, storeId) + .then((result) => { + console.log('\n' + '='.repeat(60)); + console.log('COMPLETE'); + console.log(JSON.stringify(result, null, 2)); + }) + .catch((error) => { + console.error('Error:', error.message); + process.exit(1); + }); diff --git a/backend/src/scripts/test-dutchie-e2e.ts b/backend/src/scripts/test-dutchie-e2e.ts new file mode 100644 index 00000000..183abfc9 --- /dev/null +++ b/backend/src/scripts/test-dutchie-e2e.ts @@ -0,0 +1,156 @@ +/** + * Test script: End-to-end Dutchie GraphQL → DB → Dashboard flow + * + * This 
demonstrates the complete data pipeline: + * 1. Fetch one product from Dutchie GraphQL via Puppeteer + * 2. Normalize it to our schema + * 3. Show the mapping + */ + +import { normalizeDutchieProduct, DutchieProduct, NormalizedProduct } from '../scrapers/dutchie-graphql'; +import * as fs from 'fs'; + +// Load the captured sample product from schema capture +const capturedData = JSON.parse( + fs.readFileSync('/tmp/dutchie-schema-capture.json', 'utf-8') +); + +const sampleProduct: DutchieProduct = capturedData.sampleProduct; + +console.log('='.repeat(80)); +console.log('DUTCHIE GRAPHQL → DATABASE MAPPING DEMONSTRATION'); +console.log('='.repeat(80)); + +console.log('\n📥 RAW DUTCHIE GRAPHQL PRODUCT:'); +console.log('-'.repeat(80)); + +// Show key fields from raw product +const keyRawFields = { + '_id': sampleProduct._id, + 'Name': sampleProduct.Name, + 'cName': sampleProduct.cName, + 'brandName': sampleProduct.brandName, + 'brand.id': sampleProduct.brand?.id, + 'type': sampleProduct.type, + 'subcategory': sampleProduct.subcategory, + 'strainType': sampleProduct.strainType, + 'Prices': sampleProduct.Prices, + 'recPrices': sampleProduct.recPrices, + 'recSpecialPrices': sampleProduct.recSpecialPrices, + 'special': sampleProduct.special, + 'specialData.saleSpecials[0].specialName': sampleProduct.specialData?.saleSpecials?.[0]?.specialName, + 'specialData.saleSpecials[0].discount': sampleProduct.specialData?.saleSpecials?.[0]?.discount, + 'THCContent.range[0]': sampleProduct.THCContent?.range?.[0], + 'CBDContent.range[0]': sampleProduct.CBDContent?.range?.[0], + 'Status': sampleProduct.Status, + 'Image': sampleProduct.Image, + 'POSMetaData.canonicalSKU': sampleProduct.POSMetaData?.canonicalSKU, + 'POSMetaData.children[0].quantity': sampleProduct.POSMetaData?.children?.[0]?.quantity, + 'POSMetaData.children[0].quantityAvailable': sampleProduct.POSMetaData?.children?.[0]?.quantityAvailable, +}; + +Object.entries(keyRawFields).forEach(([key, value]) => { + console.log(` 
${key}: ${JSON.stringify(value)}`); +}); + +console.log('\n📤 NORMALIZED DATABASE ROW:'); +console.log('-'.repeat(80)); + +// Normalize the product +const normalized: NormalizedProduct = normalizeDutchieProduct(sampleProduct); + +// Show the normalized result (excluding raw_data for readability) +const { raw_data, cannabinoids, special_data, ...displayFields } = normalized; + +Object.entries(displayFields).forEach(([key, value]) => { + if (value !== undefined && value !== null) { + console.log(` ${key}: ${JSON.stringify(value)}`); + } +}); + +console.log('\n🔗 FIELD MAPPING:'); +console.log('-'.repeat(80)); + +const fieldMappings = [ + ['_id / id', 'external_id', sampleProduct._id, normalized.external_id], + ['Name', 'name', sampleProduct.Name, normalized.name], + ['cName', 'slug', sampleProduct.cName, normalized.slug], + ['brandName', 'brand', sampleProduct.brandName, normalized.brand], + ['brand.id', 'brand_external_id', sampleProduct.brand?.id, normalized.brand_external_id], + ['subcategory', 'subcategory', sampleProduct.subcategory, normalized.subcategory], + ['strainType', 'strain_type', sampleProduct.strainType, normalized.strain_type], + ['recPrices[0]', 'rec_price', sampleProduct.recPrices?.[0], normalized.rec_price], + ['recSpecialPrices[0]', 'rec_special_price', sampleProduct.recSpecialPrices?.[0], normalized.rec_special_price], + ['special', 'is_on_special', sampleProduct.special, normalized.is_on_special], + ['specialData...specialName', 'special_name', sampleProduct.specialData?.saleSpecials?.[0]?.specialName?.substring(0, 40) + '...', normalized.special_name?.substring(0, 40) + '...'], + ['THCContent.range[0]', 'thc_percentage', sampleProduct.THCContent?.range?.[0], normalized.thc_percentage], + ['CBDContent.range[0]', 'cbd_percentage', sampleProduct.CBDContent?.range?.[0], normalized.cbd_percentage], + ['Status', 'status', sampleProduct.Status, normalized.status], + ['Image', 'image_url', sampleProduct.Image?.substring(0, 50) + '...', 
normalized.image_url?.substring(0, 50) + '...'], + ['POSMetaData.canonicalSKU', 'sku', sampleProduct.POSMetaData?.canonicalSKU, normalized.sku], +]; + +console.log(' GraphQL Field → DB Column | Value'); +console.log(' ' + '-'.repeat(75)); + +fieldMappings.forEach(([gqlField, dbCol, gqlVal, dbVal]) => { + const gqlStr = String(gqlField).padEnd(30); + const dbStr = String(dbCol).padEnd(20); + console.log(` ${gqlStr} → ${dbStr} | ${JSON.stringify(dbVal)}`); +}); + +console.log('\n📊 SQL INSERT STATEMENT:'); +console.log('-'.repeat(80)); + +// Generate example SQL +const sqlExample = ` +INSERT INTO products ( + store_id, external_id, slug, name, + brand, brand_external_id, + subcategory, strain_type, + rec_price, rec_special_price, + is_on_special, special_name, discount_percent, + thc_percentage, cbd_percentage, + status, image_url, sku +) VALUES ( + 1, -- store_id (Deeply Rooted) + '${normalized.external_id}', -- external_id + '${normalized.slug}', -- slug + '${normalized.name}', -- name + '${normalized.brand}', -- brand + '${normalized.brand_external_id}', -- brand_external_id + '${normalized.subcategory}', -- subcategory + '${normalized.strain_type}', -- strain_type + ${normalized.rec_price}, -- rec_price + ${normalized.rec_special_price}, -- rec_special_price + ${normalized.is_on_special}, -- is_on_special + '${normalized.special_name?.substring(0, 50)}...', -- special_name + ${normalized.discount_percent || 'NULL'}, -- discount_percent + ${normalized.thc_percentage}, -- thc_percentage + ${normalized.cbd_percentage}, -- cbd_percentage + '${normalized.status}', -- status + '${normalized.image_url}', -- image_url + '${normalized.sku}' -- sku +) +ON CONFLICT (store_id, slug) DO UPDATE SET ...; +`; + +console.log(sqlExample); + +console.log('\n✅ SUMMARY:'); +console.log('-'.repeat(80)); +console.log(` Product: ${normalized.name}`); +console.log(` Brand: ${normalized.brand}`); +console.log(` Category: ${normalized.subcategory}`); +console.log(` Price: 
$${normalized.rec_price} → $${normalized.rec_special_price} (${normalized.discount_percent}% off)`); +console.log(` THC: ${normalized.thc_percentage}%`); +console.log(` Status: ${normalized.status}`); +console.log(` On Special: ${normalized.is_on_special}`); +console.log(` SKU: ${normalized.sku}`); + +console.log('\n🎯 DERIVED VIEWS (computed from products table):'); +console.log('-'.repeat(80)); +console.log(' - current_specials: Products where is_on_special = true'); +console.log(' - derived_brands: Aggregated by brand name with counts/prices'); +console.log(' - derived_categories: Aggregated by subcategory'); +console.log('\nAll views are computed from the single products table - no separate tables needed!'); diff --git a/backend/src/scripts/test-dutchie-graphql.ts b/backend/src/scripts/test-dutchie-graphql.ts new file mode 100644 index 00000000..a5fc1889 --- /dev/null +++ b/backend/src/scripts/test-dutchie-graphql.ts @@ -0,0 +1,233 @@ +/** + * Test script to validate Dutchie GraphQL API access and capture response structure + */ + +// @ts-ignore - node-fetch type declaration not installed +import fetch from 'node-fetch'; + +const GRAPHQL_HASHES = { + ConsumerDispensaries: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b', + GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b', + FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0', + MenuFiltersV2: '2f0b3233b8a2426b391649ca3f0f7a5d43b9aefd683f6286d7261a2517e3568e', + FilteredSpecials: '0dfb85a4fc138c55a076d4d11bf6d1a25f7cbd511428e1cf5a5b863b3eb23f25', +}; + +interface DutchieProduct { + id: string; + name: string; + slug?: string; + brand?: string; + brandId?: string; + type?: string; + category?: string; + subcategory?: string; + description?: string; + image?: string; + images?: string[]; + THCContent?: any; + CBDContent?: any; + terpenes?: any[]; + effects?: string[]; + strainType?: string; + weight?: string; + 
options?: any[]; + pricing?: any; + specialPricing?: any; + potencyThc?: any; + potencyCbd?: any; + labResults?: any; + [key: string]: any; // Catch-all for additional fields +} + +async function fetchProducts(dispensaryId: string, page = 0, perPage = 25): Promise { + const session = 'crawlsy-session-' + Date.now(); + + const variables = { + includeEnterpriseSpecials: false, + productsFilter: { + dispensaryId, + pricingType: 'rec', + Status: null, // null to include all (in-stock and out-of-stock) + types: [], + useCache: true, + isDefaultSort: true, + sortBy: 'popularSortIdx', + sortDirection: 1, + bypassOnlineThresholds: true, + isKioskMenu: false, + removeProductsBelowOptionThresholds: false + }, + page, + perPage + }; + + const qs = new URLSearchParams({ + operationName: 'FilteredProducts', + variables: JSON.stringify(variables), + extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: GRAPHQL_HASHES.FilteredProducts } }) + }); + + const res = await fetch(`https://dutchie.com/api-3/graphql?${qs.toString()}`, { + headers: { + 'x-dutchie-session': session, + 'apollographql-client-name': 'Marketplace (production)', + 'content-type': 'application/json', + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' + } + }); + + if (!res.ok) { + const text = await res.text(); + console.error('HTTP Status:', res.status); + console.error('Response:', text.substring(0, 500)); + throw new Error(`HTTP ${res.status}: ${text.substring(0, 200)}`); + } + + return res.json(); +} + +async function resolveDispensaryId(cName: string): Promise { + const session = 'crawlsy-session-' + Date.now(); + + const variables = { input: { dispensaryId: cName } }; + + const qs = new URLSearchParams({ + operationName: 'GetAddressBasedDispensaryData', + variables: JSON.stringify(variables), + extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: GRAPHQL_HASHES.GetAddressBasedDispensaryData } }) 
+ }); + + const res = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, { + headers: { + 'x-dutchie-session': session, + 'apollographql-client-name': 'Marketplace (production)', + 'content-type': 'application/json', + 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' + } + }); + + if (!res.ok) { + console.error('Failed to resolve dispensary ID:', res.status); + return null; + } + + const data: any = await res.json(); + return data?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId || null; +} + +function enumerateFields(obj: any, prefix = ''): string[] { + const fields: string[] = []; + + for (const [key, value] of Object.entries(obj)) { + const path = prefix ? `${prefix}.${key}` : key; + + if (value === null) { + fields.push(`${path}: null`); + } else if (Array.isArray(value)) { + fields.push(`${path}: Array[${value.length}]`); + if (value.length > 0 && typeof value[0] === 'object') { + const subFields = enumerateFields(value[0], `${path}[0]`); + fields.push(...subFields); + } + } else if (typeof value === 'object') { + fields.push(`${path}: Object`); + const subFields = enumerateFields(value, path); + fields.push(...subFields); + } else { + const typeStr = typeof value; + const preview = String(value).substring(0, 50); + fields.push(`${path}: ${typeStr} = "${preview}"`); + } + } + + return fields; +} + +async function main() { + console.log('='.repeat(80)); + console.log('DUTCHIE GRAPHQL API TEST'); + console.log('='.repeat(80)); + + const cName = 'AZ-Deeply-Rooted'; + + // Step 1: Resolve dispensary ID + console.log(`\n1. 
Resolving dispensary ID for "${cName}"...`); + const dispensaryId = await resolveDispensaryId(cName); + + const finalDispensaryId = dispensaryId || '6405ef617056e8014d79101b'; // Fallback to known ID + if (!dispensaryId) { + console.log(' Failed to resolve via API, using hardcoded ID: 6405ef617056e8014d79101b'); + } + + console.log(` Final ID: ${finalDispensaryId}`); + + // Step 2: Fetch first page of products + console.log('\n2. Fetching products (page 0, perPage 5)...'); + const result = await fetchProducts(finalDispensaryId, 0, 5); + + if (result.errors) { + console.error('\nGraphQL Errors:'); + console.error(JSON.stringify(result.errors, null, 2)); + return; + } + + const products = result?.data?.filteredProducts?.products || []; + console.log(` Found ${products.length} products in this page`); + + if (products.length === 0) { + console.log('No products returned. Full response:'); + console.log(JSON.stringify(result, null, 2)); + return; + } + + // Step 3: Enumerate all fields from first product + console.log('\n3. PRODUCT FIELD STRUCTURE (from first product):'); + console.log('-'.repeat(80)); + + const product = products[0]; + const fields = enumerateFields(product); + fields.forEach(f => console.log(` ${f}`)); + + // Step 4: Show full sample product JSON + console.log('\n4. FULL SAMPLE PRODUCT JSON:'); + console.log('-'.repeat(80)); + console.log(JSON.stringify(product, null, 2)); + + // Step 5: Summary of key fields for schema design + console.log('\n5. 
KEY FIELDS FOR SCHEMA DESIGN:'); + console.log('-'.repeat(80)); + + const keyFields = [ + { field: 'id', value: product.id }, + { field: 'name', value: product.name }, + { field: 'slug', value: product.slug }, + { field: 'brand', value: product.brand }, + { field: 'brandId', value: product.brandId }, + { field: 'type', value: product.type }, + { field: 'category', value: product.category }, + { field: 'subcategory', value: product.subcategory }, + { field: 'strainType', value: product.strainType }, + { field: 'THCContent', value: product.THCContent }, + { field: 'CBDContent', value: product.CBDContent }, + { field: 'description', value: product.description?.substring(0, 100) + '...' }, + { field: 'image', value: product.image }, + { field: 'options.length', value: product.options?.length }, + { field: 'pricing', value: product.pricing }, + { field: 'terpenes.length', value: product.terpenes?.length }, + { field: 'effects.length', value: product.effects?.length }, + ]; + + keyFields.forEach(({ field, value }) => { + console.log(` ${field}: ${JSON.stringify(value)}`); + }); + + // Step 6: Show an option (variant) if available + if (product.options && product.options.length > 0) { + console.log('\n6. 
SAMPLE OPTION/VARIANT:'); + console.log('-'.repeat(80)); + console.log(JSON.stringify(product.options[0], null, 2)); + } +} + +main().catch(console.error); diff --git a/backend/src/scripts/test-status-filter.ts b/backend/src/scripts/test-status-filter.ts new file mode 100644 index 00000000..6bb11026 --- /dev/null +++ b/backend/src/scripts/test-status-filter.ts @@ -0,0 +1,106 @@ +/** + * Test different Status filter values in Dutchie GraphQL + */ + +import puppeteer from 'puppeteer-extra'; +import StealthPlugin from 'puppeteer-extra-plugin-stealth'; + +puppeteer.use(StealthPlugin()); + +const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0'; + +async function main() { + const browser = await puppeteer.launch({ + headless: 'new', + args: ['--no-sandbox', '--disable-setuid-sandbox'], + }); + + const page = await browser.newPage(); + await page.setUserAgent( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36' + ); + + console.log('Loading menu...'); + await page.goto('https://dutchie.com/embedded-menu/AZ-Deeply-Rooted', { + waitUntil: 'networkidle2', + timeout: 60000, + }); + await new Promise((r) => setTimeout(r, 3000)); + + const dispensaryId = await page.evaluate(() => (window as any).reactEnv?.dispensaryId); + console.log('Dispensary ID:', dispensaryId); + + // Test different status values + const testCases = [ + { label: 'Active', status: 'Active', includeStatus: true }, + { label: 'Inactive', status: 'Inactive', includeStatus: true }, + { label: 'null', status: null, includeStatus: true }, + { label: 'omitted', status: null, includeStatus: false }, + ]; + + for (const testCase of testCases) { + const result = await page.evaluate( + async (dispId: string, hash: string, status: string | null, includeStatus: boolean) => { + const filter: any = { + dispensaryId: dispId, + pricingType: 'rec', + types: [], + useCache: false, + isDefaultSort: true, + sortBy: 'popularSortIdx', + sortDirection: 
1, + bypassOnlineThresholds: true, + isKioskMenu: false, + removeProductsBelowOptionThresholds: false, + }; + + if (includeStatus) { + filter.Status = status; + } + + const variables = { + includeEnterpriseSpecials: false, + productsFilter: filter, + page: 0, + perPage: 100, + }; + + const qs = new URLSearchParams({ + operationName: 'FilteredProducts', + variables: JSON.stringify(variables), + extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }), + }); + + const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, { + method: 'GET', + headers: { + 'content-type': 'application/json', + 'apollographql-client-name': 'Marketplace (production)', + }, + credentials: 'include', + }); + + const json = await resp.json(); + const products = json?.data?.filteredProducts?.products || []; + return { + count: products.length, + totalCount: json?.data?.filteredProducts?.queryInfo?.totalCount, + sampleStatus: products[0]?.Status, + statuses: [...new Set(products.map((p: any) => p.Status))], + }; + }, + dispensaryId, + GRAPHQL_HASH, + testCase.status, + testCase.includeStatus + ); + + console.log( + `Status ${testCase.label}: Products=${result.count}, Total=${result.totalCount}, Statuses=${JSON.stringify(result.statuses)}` + ); + } + + await browser.close(); +} + +main().catch(console.error); diff --git a/backend/src/services/store-crawl-orchestrator.ts b/backend/src/services/store-crawl-orchestrator.ts index b366248b..5e7bdd28 100644 --- a/backend/src/services/store-crawl-orchestrator.ts +++ b/backend/src/services/store-crawl-orchestrator.ts @@ -20,7 +20,13 @@ import { MultiCategoryDetectionResult, } from './intelligence-detector'; import { runCrawlProductsJob, runSandboxProductsJob } from './category-crawler-jobs'; -import { scrapeStore } from '../scraper-v2'; +// DEPRECATED: scrapeStore writes to legacy products table +// import { scrapeStore } from '../scraper-v2'; + +// Import the new dutchie-az pipeline for Dutchie crawling +import { 
crawlDispensaryProducts } from '../dutchie-az/services/product-crawler'; +import { query as dutchieAzQuery } from '../dutchie-az/db/connection'; +import { Dispensary as DutchieAzDispensary } from '../dutchie-az/types'; // ======================================== // Types @@ -159,39 +165,61 @@ export async function runStoreCrawlOrchestrator(storeId: number): Promise( + `SELECT * FROM dispensaries + WHERE name ILIKE $1 + OR slug ILIKE $2 + LIMIT 1`, + [store.dispensary_name, store.slug] + ); - // Get crawl stats from the latest job - const stats = await getLatestCrawlStats(storeId); + if (dispensaryResult.rows.length === 0) { + throw new Error( + `Dispensary not found in dutchie-az database. ` + + `You must add this dispensary to the dutchie-az pipeline first. ` + + `Store: ${store.name} (${store.dispensary_name})` + ); + } + + const dutchieDispensary = dispensaryResult.rows[0]; + + // Run the new dutchie-az GraphQL crawler + const crawlResult = await crawlDispensaryProducts(dutchieDispensary, 'rec', { useBothModes: true }); result.crawlRan = true; result.crawlType = 'production'; - result.productsFound = stats.products_found ?? undefined; - result.productsNew = stats.products_new ?? undefined; - result.productsUpdated = stats.products_updated ?? undefined; + result.productsFound = crawlResult.productsFound ?? undefined; + result.productsNew = crawlResult.productsUpserted ?? undefined; + result.productsUpdated = crawlResult.snapshotsCreated ?? undefined; - const detectionPart = result.detectionRan ? 'Detection + ' : ''; - result.summary = `${detectionPart}Dutchie products crawl (${stats.products_found || 0} items, ${stats.products_new || 0} new, ${stats.products_updated || 0} updated)`; - result.status = 'success'; + if (crawlResult.success) { + const detectionPart = result.detectionRan ? 
'Detection + ' : ''; + result.summary = `${detectionPart}Dutchie GraphQL crawl (${crawlResult.productsFound || 0} items, ${crawlResult.productsUpserted || 0} upserted, ${crawlResult.snapshotsCreated || 0} snapshots)`; + result.status = 'success'; - // Update store's last_scraped_at - await pool.query('UPDATE stores SET last_scraped_at = NOW() WHERE id = $1', [storeId]); + // Update store's last_scraped_at + await pool.query('UPDATE stores SET last_scraped_at = NOW() WHERE id = $1', [storeId]); - crawlerLogger.jobCompleted({ - job_id: 0, // Orchestrator doesn't create traditional jobs - store_id: storeId, - store_name: store.name, - duration_ms: Date.now() - startTime, - products_found: stats.products_found || 0, - products_new: stats.products_new || 0, - products_updated: stats.products_updated || 0, - provider: 'dutchie', - }); + crawlerLogger.jobCompleted({ + job_id: 0, // Orchestrator doesn't create traditional jobs + store_id: storeId, + store_name: store.name, + duration_ms: crawlResult.durationMs, + products_found: crawlResult.productsFound || 0, + products_new: crawlResult.productsUpserted || 0, + products_updated: crawlResult.snapshotsCreated || 0, + provider: 'dutchie', + }); + } else { + throw new Error(crawlResult.errorMessage || 'Crawl failed'); + } } catch (crawlError: any) { result.status = 'error'; diff --git a/backend/src/utils/image-storage.ts b/backend/src/utils/image-storage.ts new file mode 100644 index 00000000..cabb4f6a --- /dev/null +++ b/backend/src/utils/image-storage.ts @@ -0,0 +1,322 @@ +/** + * Local Image Storage Utility + * + * Downloads and stores product images to local filesystem. + * Replaces MinIO-based storage with simple local file storage. 
+ * + * Directory structure: + * /images/products//.webp + * /images/products//-thumb.webp + * /images/products//-medium.webp + * /images/brands/.webp + */ + +import axios from 'axios'; +import sharp from 'sharp'; +import * as fs from 'fs/promises'; +import * as path from 'path'; +import { createHash } from 'crypto'; + +// Base path for image storage - configurable via env +const IMAGES_BASE_PATH = process.env.IMAGES_PATH || '/app/public/images'; + +// Public URL base for serving images +const IMAGES_PUBLIC_URL = process.env.IMAGES_PUBLIC_URL || '/images'; + +export interface LocalImageSizes { + full: string; // URL path: /images/products/123/456.webp + medium: string; // URL path: /images/products/123/456-medium.webp + thumb: string; // URL path: /images/products/123/456-thumb.webp +} + +export interface DownloadResult { + success: boolean; + urls?: LocalImageSizes; + error?: string; + bytesDownloaded?: number; +} + +/** + * Ensure a directory exists + */ +async function ensureDir(dirPath: string): Promise { + try { + await fs.mkdir(dirPath, { recursive: true }); + } catch (error: any) { + if (error.code !== 'EEXIST') throw error; + } +} + +/** + * Generate a short hash from a URL for deduplication + */ +function hashUrl(url: string): string { + return createHash('md5').update(url).digest('hex').substring(0, 8); +} + +/** + * Download an image from a URL and return the buffer + */ +async function downloadImage(imageUrl: string): Promise { + const response = await axios.get(imageUrl, { + responseType: 'arraybuffer', + timeout: 30000, + headers: { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', + 'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8', + }, + }); + return Buffer.from(response.data); +} + +/** + * Process and save image in multiple sizes + * Returns the file paths relative to IMAGES_BASE_PATH + */ +async function processAndSaveImage( + buffer: Buffer, + outputDir: string, + baseFilename: string +): Promise<{ full: string; 
medium: string; thumb: string; totalBytes: number }> { + await ensureDir(outputDir); + + const fullPath = path.join(outputDir, `${baseFilename}.webp`); + const mediumPath = path.join(outputDir, `${baseFilename}-medium.webp`); + const thumbPath = path.join(outputDir, `${baseFilename}-thumb.webp`); + + // Process images in parallel + const [fullBuffer, mediumBuffer, thumbBuffer] = await Promise.all([ + // Full: max 1200x1200, high quality + sharp(buffer) + .resize(1200, 1200, { fit: 'inside', withoutEnlargement: true }) + .webp({ quality: 85 }) + .toBuffer(), + // Medium: 600x600 + sharp(buffer) + .resize(600, 600, { fit: 'inside', withoutEnlargement: true }) + .webp({ quality: 80 }) + .toBuffer(), + // Thumb: 200x200 + sharp(buffer) + .resize(200, 200, { fit: 'inside', withoutEnlargement: true }) + .webp({ quality: 75 }) + .toBuffer(), + ]); + + // Save all sizes + await Promise.all([ + fs.writeFile(fullPath, fullBuffer), + fs.writeFile(mediumPath, mediumBuffer), + fs.writeFile(thumbPath, thumbBuffer), + ]); + + const totalBytes = fullBuffer.length + mediumBuffer.length + thumbBuffer.length; + + return { + full: fullPath, + medium: mediumPath, + thumb: thumbPath, + totalBytes, + }; +} + +/** + * Convert a file path to a public URL + */ +function pathToUrl(filePath: string): string { + const relativePath = filePath.replace(IMAGES_BASE_PATH, ''); + return `${IMAGES_PUBLIC_URL}${relativePath}`; +} + +/** + * Download and store a product image locally + * + * @param imageUrl - The third-party image URL to download + * @param dispensaryId - The dispensary ID (for directory organization) + * @param productId - The product ID or external ID (for filename) + * @returns Download result with local URLs + */ +export async function downloadProductImage( + imageUrl: string, + dispensaryId: number, + productId: string | number +): Promise { + try { + if (!imageUrl) { + return { success: false, error: 'No image URL provided' }; + } + + // Download the image + const buffer = await 
downloadImage(imageUrl); + + // Organize by dispensary ID + const outputDir = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId)); + + // Use product ID + URL hash for uniqueness + const urlHash = hashUrl(imageUrl); + const baseFilename = `${productId}-${urlHash}`; + + // Process and save + const result = await processAndSaveImage(buffer, outputDir, baseFilename); + + return { + success: true, + urls: { + full: pathToUrl(result.full), + medium: pathToUrl(result.medium), + thumb: pathToUrl(result.thumb), + }, + bytesDownloaded: result.totalBytes, + }; + } catch (error: any) { + return { + success: false, + error: error.message || 'Failed to download image', + }; + } +} + +/** + * Download and store a brand logo locally + * + * @param logoUrl - The brand logo URL + * @param brandId - The brand ID or slug + * @returns Download result with local URL + */ +export async function downloadBrandLogo( + logoUrl: string, + brandId: string +): Promise { + try { + if (!logoUrl) { + return { success: false, error: 'No logo URL provided' }; + } + + // Download the image + const buffer = await downloadImage(logoUrl); + + // Brand logos go in /images/brands/ + const outputDir = path.join(IMAGES_BASE_PATH, 'brands'); + + // Sanitize brand ID for filename + const safeBrandId = brandId.replace(/[^a-zA-Z0-9-_]/g, '_'); + const urlHash = hashUrl(logoUrl); + const baseFilename = `${safeBrandId}-${urlHash}`; + + // Process and save (single size for logos) + await ensureDir(outputDir); + const logoPath = path.join(outputDir, `${baseFilename}.webp`); + + const logoBuffer = await sharp(buffer) + .resize(400, 400, { fit: 'inside', withoutEnlargement: true }) + .webp({ quality: 85 }) + .toBuffer(); + + await fs.writeFile(logoPath, logoBuffer); + + return { + success: true, + urls: { + full: pathToUrl(logoPath), + medium: pathToUrl(logoPath), + thumb: pathToUrl(logoPath), + }, + bytesDownloaded: logoBuffer.length, + }; + } catch (error: any) { + return { + success: false, + error: 
error.message || 'Failed to download brand logo', + }; + } +} + +/** + * Check if a local image already exists + */ +export async function imageExists( + dispensaryId: number, + productId: string | number, + imageUrl: string +): Promise { + const urlHash = hashUrl(imageUrl); + const imagePath = path.join( + IMAGES_BASE_PATH, + 'products', + String(dispensaryId), + `${productId}-${urlHash}.webp` + ); + try { + await fs.access(imagePath); + return true; + } catch { + return false; + } +} + +/** + * Delete a product's local images + */ +export async function deleteProductImages( + dispensaryId: number, + productId: string | number, + imageUrl?: string +): Promise { + const productDir = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId)); + const prefix = imageUrl + ? `${productId}-${hashUrl(imageUrl)}` + : String(productId); + + try { + const files = await fs.readdir(productDir); + const toDelete = files.filter(f => f.startsWith(prefix)); + await Promise.all(toDelete.map(f => fs.unlink(path.join(productDir, f)))); + } catch { + // Directory might not exist, that's fine + } +} + +/** + * Initialize the image storage directories + */ +export async function initializeImageStorage(): Promise { + await ensureDir(path.join(IMAGES_BASE_PATH, 'products')); + await ensureDir(path.join(IMAGES_BASE_PATH, 'brands')); + console.log(`✅ Image storage initialized at ${IMAGES_BASE_PATH}`); +} + +/** + * Get storage stats + */ +export async function getStorageStats(): Promise<{ + productsDir: string; + brandsDir: string; + productCount: number; + brandCount: number; +}> { + const productsDir = path.join(IMAGES_BASE_PATH, 'products'); + const brandsDir = path.join(IMAGES_BASE_PATH, 'brands'); + + let productCount = 0; + let brandCount = 0; + + try { + const productDirs = await fs.readdir(productsDir); + for (const dir of productDirs) { + const files = await fs.readdir(path.join(productsDir, dir)); + productCount += files.filter(f => f.endsWith('.webp') && 
!f.includes('-')).length; + } + } catch { /* ignore */ } + + try { + const brandFiles = await fs.readdir(brandsDir); + brandCount = brandFiles.filter(f => f.endsWith('.webp')).length; + } catch { /* ignore */ } + + return { + productsDir, + brandsDir, + productCount, + brandCount, + }; +} diff --git a/backend/src/utils/product-normalizer.ts b/backend/src/utils/product-normalizer.ts new file mode 100644 index 00000000..6961014d --- /dev/null +++ b/backend/src/utils/product-normalizer.ts @@ -0,0 +1,206 @@ +/** + * Product Normalizer Utility + * + * Functions for normalizing product data to enable consistent matching + * and prevent duplicate product entries. + */ + +/** + * Normalize product name for matching + * - Lowercase + * - Remove punctuation + * - Remove THC/CBD percentages often appended to names + * - Remove weight suffixes + * - Remove emoji + * - Normalize whitespace + */ +export function normalizeProductName(name: string): string { + if (!name) return ''; + + return name + .toLowerCase() + .trim() + // Remove special characters except alphanumeric and spaces + .replace(/[^\w\s]/g, ' ') + // Remove common suffixes like THC/CBD percentages appended to names + .replace(/\s*(thc|cbd|cbg|cbn|tac)\s*[:=]?\s*[\d.]+\s*%?/gi, '') + // Remove weight/size suffixes often appended + .replace(/\s*\d+(\.\d+)?\s*(mg|g|oz|ml|gram|grams|ounce|ounces)\b/gi, '') + // Remove emoji + .replace(/[\u{1F300}-\u{1F9FF}]/gu, '') + // Remove "special offer" type suffixes + .replace(/\s*special\s*offer\s*/gi, '') + // Normalize multiple spaces to single space + .replace(/\s+/g, ' ') + .trim(); +} + +/** + * Normalize brand name for matching + */ +export function normalizeBrandName(brand: string | null | undefined): string { + if (!brand) return ''; + + return brand + .toLowerCase() + .trim() + // Remove special characters + .replace(/[^\w\s]/g, ' ') + // Normalize whitespace + .replace(/\s+/g, ' ') + .trim(); +} + +/** + * Normalize weight string to standard format + * e.g., "3.5 
grams" -> "3.5g", "1/8 oz" -> "3.5g" + */ +export function normalizeWeight(weight: string | null | undefined): string { + if (!weight) return ''; + + const w = weight.toLowerCase().trim(); + + // Handle fractional ounces + if (w.includes('1/8') || w.includes('eighth')) { + return '3.5g'; + } + if (w.includes('1/4') || w.includes('quarter')) { + return '7g'; + } + if (w.includes('1/2') || w.includes('half')) { + return '14g'; + } + if (w.includes('1 oz') || w === 'oz' || w === '1oz') { + return '28g'; + } + + // Extract numeric value and unit + const match = w.match(/([\d.]+)\s*(mg|g|oz|ml|gram|grams?|ounce|ounces?)?/i); + if (!match) return w; + + const value = parseFloat(match[1]); + let unit = (match[2] || 'g').toLowerCase(); + + // Normalize unit names + unit = unit.replace(/gram(s)?/, 'g').replace(/ounce(s)?/, 'oz'); + + // Convert oz to grams for consistency + if (unit === 'oz') { + return `${(value * 28).toFixed(1)}g`; + } + + return `${value}${unit}`; +} + +/** + * Generate a matching fingerprint for a product + * Used for deduplication + */ +export function generateProductFingerprint( + name: string, + brand: string | null | undefined, + weight: string | null | undefined, + categoryId: number | null | undefined +): string { + const parts = [ + normalizeProductName(name), + normalizeBrandName(brand), + normalizeWeight(weight), + categoryId?.toString() || '' + ]; + + return parts.filter(Boolean).join('|'); +} + +/** + * Calculate similarity between two strings (0-100) + * Uses Levenshtein distance + */ +export function stringSimilarity(str1: string, str2: string): number { + if (str1 === str2) return 100; + if (!str1 || !str2) return 0; + + const s1 = str1.toLowerCase(); + const s2 = str2.toLowerCase(); + + if (s1 === s2) return 100; + + const longer = s1.length > s2.length ? s1 : s2; + const shorter = s1.length > s2.length ? 
s2 : s1; + + const longerLength = longer.length; + if (longerLength === 0) return 100; + + const distance = levenshteinDistance(longer, shorter); + return Math.round(((longerLength - distance) / longerLength) * 100); +} + +/** + * Levenshtein distance between two strings + */ +function levenshteinDistance(str1: string, str2: string): number { + const m = str1.length; + const n = str2.length; + + // Create distance matrix + const dp: number[][] = Array(m + 1).fill(null).map(() => Array(n + 1).fill(0)); + + // Initialize first row and column + for (let i = 0; i <= m; i++) dp[i][0] = i; + for (let j = 0; j <= n; j++) dp[0][j] = j; + + // Fill in the rest + for (let i = 1; i <= m; i++) { + for (let j = 1; j <= n; j++) { + const cost = str1[i - 1] === str2[j - 1] ? 0 : 1; + dp[i][j] = Math.min( + dp[i - 1][j] + 1, // deletion + dp[i][j - 1] + 1, // insertion + dp[i - 1][j - 1] + cost // substitution + ); + } + } + + return dp[m][n]; +} + +/** + * Check if two products are likely the same + * Returns confidence score (0-100) + */ +export function areProductsSimilar( + product1: { name: string; brand?: string | null; weight?: string | null }, + product2: { name: string; brand?: string | null; weight?: string | null }, + threshold: number = 92 +): { isSimilar: boolean; confidence: number } { + const name1 = normalizeProductName(product1.name); + const name2 = normalizeProductName(product2.name); + + const nameSimilarity = stringSimilarity(name1, name2); + + // If names are very similar, likely same product + if (nameSimilarity >= threshold) { + return { isSimilar: true, confidence: nameSimilarity }; + } + + // Check brand match for additional confidence + const brand1 = normalizeBrandName(product1.brand); + const brand2 = normalizeBrandName(product2.brand); + + if (brand1 && brand2 && brand1 === brand2) { + // Same brand, lower threshold for name match + if (nameSimilarity >= threshold - 10) { + return { isSimilar: true, confidence: nameSimilarity + 5 }; + } + } + + // 
Check weight match + const weight1 = normalizeWeight(product1.weight); + const weight2 = normalizeWeight(product2.weight); + + if (weight1 && weight2 && weight1 === weight2 && nameSimilarity >= threshold - 15) { + return { isSimilar: true, confidence: nameSimilarity + 3 }; + } + + return { isSimilar: false, confidence: nameSimilarity }; +} diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index d8d595f0..701e6527 100755 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -456,8 +456,12 @@ class ApiClient { } // Dispensary Schedule (new dispensary-centric API) - async getDispensarySchedules() { - return this.request<{ dispensaries: any[] }>('/api/schedule/dispensaries'); + async getDispensarySchedules(filters?: { state?: string; search?: string }) { + const params = new URLSearchParams(); + if (filters?.state) params.append('state', filters.state); + if (filters?.search) params.append('search', filters.search); + const queryString = params.toString(); + return this.request<{ dispensaries: any[] }>(`/api/schedule/dispensaries${queryString ? 
`?${queryString}` : ''}`); } async getDispensarySchedule(dispensaryId: number) { @@ -482,6 +486,63 @@ class ApiClient { }); } + async resolvePlatformId(dispensaryId: number) { + return this.request<{ + success: boolean; + platform_dispensary_id?: string; + slug_resolved?: string; + message: string; + already_resolved?: boolean; + error?: string; + }>(`/api/schedule/dispensaries/${dispensaryId}/resolve-platform-id`, { + method: 'POST', + }); + } + + async detectMenuType(dispensaryId: number) { + return this.request<{ + success: boolean; + menu_type: string; + url_checked: string; + message: string; + }>(`/api/schedule/dispensaries/${dispensaryId}/detect-menu-type`, { + method: 'POST', + }); + } + + async refreshDetection(dispensaryId: number) { + return this.request<{ + success: boolean; + menu_type: string; + platform_dispensary_id: string | null; + url_checked: string; + can_crawl: boolean; + }>(`/api/schedule/dispensaries/${dispensaryId}/refresh-detection`, { + method: 'POST', + }); + } + + async toggleDispensarySchedule(dispensaryId: number, isActive: boolean) { + return this.request<{ + success: boolean; + schedule: any; + message: string; + }>(`/api/schedule/dispensaries/${dispensaryId}/toggle-active`, { + method: 'PUT', + body: JSON.stringify({ is_active: isActive }), + }); + } + + async deleteDispensarySchedule(dispensaryId: number) { + return this.request<{ + success: boolean; + deleted: boolean; + message: string; + }>(`/api/schedule/dispensaries/${dispensaryId}/schedule`, { + method: 'DELETE', + }); + } + async getCrawlJobs(limit?: number) { const params = limit ? 
`?limit=${limit}` : ''; return this.request<{ jobs: any[] }>(`/api/schedule/jobs${params}`); diff --git a/frontend/src/pages/ScraperSchedule.tsx b/frontend/src/pages/ScraperSchedule.tsx index 307ae832..8ded3889 100644 --- a/frontend/src/pages/ScraperSchedule.tsx +++ b/frontend/src/pages/ScraperSchedule.tsx @@ -18,21 +18,27 @@ interface DispensarySchedule { dispensary_name: string; city: string | null; state: string | null; + dispensary_slug: string | null; slug: string | null; website: string | null; menu_url: string | null; + menu_type: string | null; + platform_dispensary_id: string | null; product_provider: string | null; provider_type: string | null; product_confidence: number | null; product_crawler_mode: string | null; last_product_scan_at: string | null; + is_active: boolean; schedule_active: boolean; - interval_minutes: number; + interval_minutes: number | null; priority: number; last_run_at: string | null; next_run_at: string | null; + schedule_last_status: string | null; last_status: string | null; last_summary: string | null; + schedule_last_error: string | null; last_error: string | null; consecutive_failures: number | null; total_runs: number | null; @@ -42,6 +48,9 @@ interface DispensarySchedule { latest_job_status: string | null; latest_job_started: string | null; latest_products_found: number | null; + // Computed from view + can_crawl: boolean; + schedule_status_reason: string | null; } interface CrawlJob { @@ -69,6 +78,21 @@ export function ScraperSchedule() { const [autoRefresh, setAutoRefresh] = useState(true); const [activeTab, setActiveTab] = useState<'dispensaries' | 'jobs' | 'global'>('dispensaries'); const [triggeringDispensary, setTriggeringDispensary] = useState(null); + const [resolvingId, setResolvingId] = useState(null); + const [refreshingDetection, setRefreshingDetection] = useState(null); + const [togglingSchedule, setTogglingSchedule] = useState(null); + const [filterDutchieOnly, setFilterDutchieOnly] = useState(false); + const 
[stateFilter, setStateFilter] = useState<'all' | 'AZ'>('all'); + const [searchTerm, setSearchTerm] = useState(''); + const [searchInput, setSearchInput] = useState(''); // For debouncing + + // Debounce search input + useEffect(() => { + const timer = setTimeout(() => { + setSearchTerm(searchInput); + }, 300); + return () => clearTimeout(timer); + }, [searchInput]); useEffect(() => { loadData(); @@ -77,13 +101,22 @@ export function ScraperSchedule() { const interval = setInterval(loadData, 5000); return () => clearInterval(interval); } - }, [autoRefresh]); + }, [autoRefresh, stateFilter, searchTerm]); const loadData = async () => { try { + // Build filters for dispensary schedules + const filters: { state?: string; search?: string } = {}; + if (stateFilter === 'AZ') { + filters.state = 'AZ'; + } + if (searchTerm.trim()) { + filters.search = searchTerm.trim(); + } + const [globalData, dispensaryData, jobsData] = await Promise.all([ api.getGlobalSchedule(), - api.getDispensarySchedules(), + api.getDispensarySchedules(Object.keys(filters).length > 0 ? filters : undefined), api.getDispensaryCrawlJobs(100) ]); @@ -129,6 +162,62 @@ export function ScraperSchedule() { } }; + const handleResolvePlatformId = async (dispensaryId: number) => { + setResolvingId(dispensaryId); + try { + const result = await api.resolvePlatformId(dispensaryId); + if (result.success) { + alert(result.message); + } else { + alert(`Failed: ${result.error || result.message}`); + } + await loadData(); + } catch (error: any) { + console.error('Failed to resolve platform ID:', error); + alert(`Error: ${error.message}`); + } finally { + setResolvingId(null); + } + }; + + const handleRefreshDetection = async (dispensaryId: number) => { + setRefreshingDetection(dispensaryId); + try { + const result = await api.refreshDetection(dispensaryId); + alert(`Detected: ${result.menu_type}${result.platform_dispensary_id ? 
`, Platform ID: ${result.platform_dispensary_id}` : ''}`); + await loadData(); + } catch (error: any) { + console.error('Failed to refresh detection:', error); + alert(`Error: ${error.message}`); + } finally { + setRefreshingDetection(null); + } + }; + + const handleToggleSchedule = async (dispensaryId: number, currentActive: boolean) => { + setTogglingSchedule(dispensaryId); + try { + await api.toggleDispensarySchedule(dispensaryId, !currentActive); + await loadData(); + } catch (error: any) { + console.error('Failed to toggle schedule:', error); + alert(`Error: ${error.message}`); + } finally { + setTogglingSchedule(null); + } + }; + + const handleDeleteSchedule = async (dispensaryId: number) => { + if (!confirm('Are you sure you want to delete this schedule?')) return; + try { + await api.deleteDispensarySchedule(dispensaryId); + await loadData(); + } catch (error: any) { + console.error('Failed to delete schedule:', error); + alert(`Error: ${error.message}`); + } + }; + const handleUpdateGlobalSchedule = async (type: string, data: any) => { try { await api.updateGlobalSchedule(type, data); @@ -373,32 +462,127 @@ export function ScraperSchedule() { )} {activeTab === 'dispensaries' && ( -
- +
+ {/* Filter Bar */} +
+ {/* State Filter Toggle */} +
+ State: +
+ + +
+
+ + {/* Search Box */} +
+ Search: + setSearchInput(e.target.value)} + style={{ + padding: '6px 12px', + borderRadius: '6px', + border: '1px solid #d1d5db', + fontSize: '14px', + width: '200px' + }} + /> + {searchInput && ( + + )} +
+ + {/* Dutchie Only Checkbox */} + + + {/* Results Count */} + + Showing {(filterDutchieOnly + ? dispensarySchedules.filter(d => d.menu_type === 'dutchie') + : dispensarySchedules + ).length} dispensaries + +
+
+
- - - - - - - + + + + + + + + - {dispensarySchedules.map((disp) => ( + {(filterDutchieOnly + ? dispensarySchedules.filter(d => d.menu_type === 'dutchie') + : dispensarySchedules + ).map((disp) => ( - - - + {/* Status Column - Shows can_crawl and reason */} + - ))}
DispensaryProviderScheduleLast RunNext RunLast ResultActionsDispensaryMenu TypePlatform IDStatusLast RunNext RunLast ResultActions
+
- {disp.state && disp.city && disp.slug ? ( + {disp.state && disp.city && (disp.dispensary_slug || disp.slug) ? ( {disp.dispensary_name} )}
-
+
{disp.city ? `${disp.city}, ${disp.state}` : disp.state}
- {(disp.product_provider || disp.provider_type) && disp.product_provider !== 'unknown' && disp.provider_type !== 'unknown' ? ( -
- - {disp.product_provider || disp.provider_type} - - {disp.product_crawler_mode !== 'production' && ( -
sandbox
- )} -
- ) : disp.menu_url ? ( + {/* Menu Type Column */} +
+ {disp.menu_type ? ( - Pending + {disp.menu_type} ) : ( - - + unknown )} + {/* Platform ID Column */} + + {disp.platform_dispensary_id ? ( + + {disp.platform_dispensary_id.length > 12 + ? `${disp.platform_dispensary_id.slice(0, 6)}...${disp.platform_dispensary_id.slice(-4)}` + : disp.platform_dispensary_id} + + ) : ( + + missing + + )} +
- {disp.schedule_active ? 'Active' : 'Disabled'} - - - Every {Math.round(disp.interval_minutes / 60)}h + {disp.can_crawl ? 'Ready' : (disp.is_active !== false ? 'Not Ready' : 'Disabled')} + {disp.schedule_status_reason && disp.schedule_status_reason !== 'ready' && ( + + {disp.schedule_status_reason} + + )} + {disp.interval_minutes && ( + + Every {Math.round(disp.interval_minutes / 60)}h + + )}
@@ -530,28 +734,91 @@ export function ScraperSchedule() { No runs yet )} - + +
+ {/* Refresh Detection - always available */} + + + {/* Resolve ID - only if dutchie and missing platform ID */} + {disp.menu_type === 'dutchie' && !disp.platform_dispensary_id && ( + + )} + + {/* Run Now - only if can_crawl */} + + + {/* Enable/Disable Schedule Toggle */} + +
+ )} {activeTab === 'jobs' && (