Add CLAUDE guidelines for consolidated pipeline
This commit is contained in:
40
CLAUDE.md
Normal file
40
CLAUDE.md
Normal file
@@ -0,0 +1,40 @@
|
||||
## Claude Guidelines for this Project
|
||||
|
||||
1) **Use the consolidated DB everywhere**
|
||||
- Preferred env: `CRAWLSY_DATABASE_URL` (fallback `DATABASE_URL`).
|
||||
- Do NOT create dutchie tables in the legacy DB. Apply migrations 031/032/033 to the consolidated DB and restart.
|
||||
|
||||
2) **Dispensary vs Store**
|
||||
- Dutchie pipeline uses `dispensaries` (not legacy `stores`). For dutchie crawls, always work with dispensary ID.
|
||||
- Ignore legacy fields like `dutchie_plus_id` and slug guessing. Use the record’s `menu_url` and `platform_dispensary_id`.
|
||||
|
||||
3) **Menu detection and platform IDs**
|
||||
- Set `menu_type` from `menu_url` detection; resolve `platform_dispensary_id` for `menu_type='dutchie'`.
|
||||
- Admin should have “refresh detection” and “resolve ID” actions; schedule/crawl only when `menu_type='dutchie'` AND `platform_dispensary_id` is set.
|
||||
|
||||
4) **Queries and mapping**
|
||||
- The DB returns snake_case; code expects camelCase. Always alias/map:
|
||||
- `platform_dispensary_id AS "platformDispensaryId"`
|
||||
- Map via `mapDbRowToDispensary` when loading dispensaries (scheduler, crawler, admin crawl).
|
||||
- Avoid `SELECT *`; explicitly select and/or map fields.
|
||||
|
||||
5) **Scheduling**
|
||||
- `/scraper-schedule` should accept filters/search (All vs AZ-only, name).
|
||||
- “Run Now”/scheduler must skip or warn if `menu_type!='dutchie'` or `platform_dispensary_id` missing.
|
||||
- Use `dispensary_crawl_status` view; show reason when not crawlable.
|
||||
|
||||
6) **Crawling**
|
||||
- Trigger dutchie crawls by dispensary ID (e.g., `/api/az/admin/crawl/:id` or `runDispensaryOrchestrator(id)`).
|
||||
- Update existing products (by stable product ID), append snapshots for history (every 4h cadence), download images locally (`/images/...`), store local URLs.
|
||||
- Use dutchie GraphQL pipeline only for `menu_type='dutchie'`.
|
||||
|
||||
7) **Frontend**
|
||||
- Forward-facing URLs: `/api/az`, `/az`, `/az-schedule`; no vendor names.
|
||||
- `/scraper-schedule`: add filters/search, keep as master view for all schedules; reflect platform ID/menu_type status and controls (resolve ID, run now, enable/disable/delete).
|
||||
|
||||
8) **No slug guessing**
|
||||
- Do not guess slugs; use the DB record’s `menu_url` and ID. Resolve platform ID from the URL/cName; if set, crawl directly by ID.
|
||||
|
||||
9) **Verify locally before pushing**
|
||||
- Apply migrations, restart backend, ensure auth (`users` table) exists, run dutchie crawl for a known dispensary (e.g., Deeply Rooted), check `/api/az/dashboard`, `/api/az/stores/:id/products`, `/az`, `/scraper-schedule`.
|
||||
|
||||
@@ -0,0 +1,13 @@
|
||||
-- Migration: link wp_dutchie_api_permissions rows to a dispensary.
-- Lets an API token be associated with one specific dispensary.

-- Both columns are added in a single ALTER; each ADD is individually idempotent.
--   dispensary_id   : reference to dispensaries(id)
--   dispensary_name : copy of the dispensary name so it can be returned without a join
ALTER TABLE wp_dutchie_api_permissions
    ADD COLUMN IF NOT EXISTS dispensary_id INTEGER REFERENCES dispensaries(id),
    ADD COLUMN IF NOT EXISTS dispensary_name VARCHAR(255);

-- Index to speed up lookups by dispensary.
CREATE INDEX IF NOT EXISTS idx_wp_api_permissions_dispensary_id
    ON wp_dutchie_api_permissions (dispensary_id);
|
||||
@@ -896,6 +896,7 @@ SELECT
|
||||
subcategory,
|
||||
COUNT(*) as product_count,
|
||||
COUNT(DISTINCT dispensary_id) as dispensary_count,
|
||||
COUNT(DISTINCT brand_name) as brand_count,
|
||||
AVG(thc) as avg_thc,
|
||||
MIN(thc) as min_thc,
|
||||
MAX(thc) as max_thc
|
||||
|
||||
36
backend/migrations/031_product_normalized_fields.sql
Normal file
36
backend/migrations/031_product_normalized_fields.sql
Normal file
@@ -0,0 +1,36 @@
|
||||
-- Migration 031: Add Normalized Fields to Products
-- Purpose: improved product matching and deduplication.

-- New columns on products (one ALTER; every ADD is individually idempotent).
ALTER TABLE products
    ADD COLUMN IF NOT EXISTS name_normalized VARCHAR(500),
    ADD COLUMN IF NOT EXISTS brand_normalized VARCHAR(255),
    ADD COLUMN IF NOT EXISTS external_id VARCHAR(255),     -- Platform-specific ID (Dutchie, Treez, etc)
    ADD COLUMN IF NOT EXISTS source_platform VARCHAR(50);  -- 'dutchie', 'treez', 'jane', 'wp'

-- Indexes for efficient matching.
CREATE INDEX IF NOT EXISTS idx_products_external_id
    ON products (external_id)
    WHERE external_id IS NOT NULL;

CREATE INDEX IF NOT EXISTS idx_products_name_normalized
    ON products (store_id, name_normalized);

CREATE INDEX IF NOT EXISTS idx_products_matching
    ON products (store_id, name_normalized, brand_normalized, category_id);

-- Backfill rows that have not been normalized yet.
--   name_normalized : non-alphanumeric characters become spaces, then trim + lowercase
--   brand_normalized: trim + lowercase, missing brand becomes ''
--   external_id     : keep an existing value, otherwise fall back to dutchie_product_id
--   source_platform : keep an existing value, otherwise 'dutchie'
UPDATE products
SET
    name_normalized  = LOWER(TRIM(REGEXP_REPLACE(name, '[^a-zA-Z0-9 ]', ' ', 'g'))),
    brand_normalized = LOWER(TRIM(COALESCE(brand, ''))),
    external_id      = COALESCE(external_id, dutchie_product_id),
    source_platform  = COALESCE(source_platform, 'dutchie')
WHERE name_normalized IS NULL;

-- Prevent true duplicates going forward.
-- The unique index is partial so that any number of rows may keep a NULL external_id.
CREATE UNIQUE INDEX IF NOT EXISTS idx_products_no_duplicate_external_id
    ON products (store_id, external_id)
    WHERE external_id IS NOT NULL;

-- Column documentation.
COMMENT ON COLUMN products.name_normalized IS 'Lowercase, trimmed product name with punctuation removed for matching';
COMMENT ON COLUMN products.brand_normalized IS 'Lowercase, trimmed brand name for matching';
COMMENT ON COLUMN products.external_id IS 'Platform-specific product ID (Dutchie ID, Treez SKU, etc)';
COMMENT ON COLUMN products.source_platform IS 'Source platform: dutchie, treez, jane, wp';

-- Permissions for the scraper role.
GRANT SELECT, INSERT, UPDATE ON products TO scraper;
|
||||
61
backend/migrations/032_menu_type_and_local_images.sql
Normal file
61
backend/migrations/032_menu_type_and_local_images.sql
Normal file
@@ -0,0 +1,61 @@
|
||||
-- Migration 032: Add menu_type column and local image storage columns
-- Run against the consolidated DB (CRAWLSY_DATABASE_URL preferred, DATABASE_URL as fallback):
--   psql "${CRAWLSY_DATABASE_URL:-$DATABASE_URL}" -f migrations/032_menu_type_and_local_images.sql

-- ============================================
-- 1. Add menu_type column to dispensaries
-- ============================================

-- menu_type: canonical, admin-editable field for menu provider type.
-- Kept separate from menu_provider (auto-detected) to allow manual override.
ALTER TABLE dispensaries ADD COLUMN IF NOT EXISTS menu_type VARCHAR(50);

-- Partial index for filtering by menu_type (rows without a type are not indexed).
CREATE INDEX IF NOT EXISTS idx_dispensaries_menu_type
    ON dispensaries (menu_type)
    WHERE menu_type IS NOT NULL;

-- Backfill menu_type from existing detection data.
-- Priority: menu_provider (if set) > menu_url pattern matching (first matching pattern wins).
-- Only rows whose menu_type is still NULL are touched, so re-running is safe.
UPDATE dispensaries
SET menu_type = CASE
        -- Use the existing menu_provider when it holds a non-blank value.
        -- It is trimmed and lowercased because consumers compare menu_type with exact
        -- lowercase literals (e.g. menu_type = 'dutchie'); a blank/whitespace-only
        -- provider falls through to URL detection instead of being copied.
        -- NOTE(review): provider values outside the known menu types are still copied as-is
        -- (lowercased) - confirm the set of values stored in menu_provider.
        WHEN NULLIF(TRIM(menu_provider), '') IS NOT NULL THEN LOWER(TRIM(menu_provider))
        -- Detect from menu_url substrings (case-insensitive).
        WHEN menu_url ILIKE '%dutchie%'     THEN 'dutchie'
        WHEN menu_url ILIKE '%treez%'       THEN 'treez'
        -- '%jane%' also matches iheartjane URLs.
        WHEN menu_url ILIKE '%jane%'        THEN 'jane'
        WHEN menu_url ILIKE '%weedmaps%'    THEN 'weedmaps'
        WHEN menu_url ILIKE '%leafly%'      THEN 'leafly'
        -- '%meadow%' also matches getmeadow URLs.
        WHEN menu_url ILIKE '%meadow%'      THEN 'meadow'
        WHEN menu_url ILIKE '%blaze%'       THEN 'blaze'
        WHEN menu_url ILIKE '%flowhub%'     THEN 'flowhub'
        WHEN menu_url ILIKE '%dispenseapp%' THEN 'dispense'
        WHEN menu_url ILIKE '%cova%'        THEN 'cova'
        ELSE NULL
    END
WHERE menu_type IS NULL;

-- ============================================
-- 2. Add local image columns to dutchie_products
-- ============================================

-- local_image_url: the URL path for serving the downloaded image (e.g., /images/products/123/456.webp)
ALTER TABLE dutchie_products ADD COLUMN IF NOT EXISTS local_image_url TEXT;

-- local_image_thumb_url: thumbnail version
ALTER TABLE dutchie_products ADD COLUMN IF NOT EXISTS local_image_thumb_url TEXT;

-- local_image_medium_url: medium version
ALTER TABLE dutchie_products ADD COLUMN IF NOT EXISTS local_image_medium_url TEXT;

-- original_image_url: preserved third-party URL for fallback/reference
-- (primary_image_url will be updated to local path when downloaded)
ALTER TABLE dutchie_products ADD COLUMN IF NOT EXISTS original_image_url TEXT;

-- Backfill original_image_url from primary_image_url (preserve third-party URLs).
-- Rows that already have an original_image_url are left untouched.
UPDATE dutchie_products
SET original_image_url = primary_image_url
WHERE original_image_url IS NULL
  AND primary_image_url IS NOT NULL;

-- ============================================
-- Done
-- ============================================
SELECT 'Migration 032 completed: menu_type column added, local image columns added' AS status;
|
||||
63
backend/migrations/033_add_platform_id_to_crawl_status.sql
Normal file
63
backend/migrations/033_add_platform_id_to_crawl_status.sql
Normal file
@@ -0,0 +1,63 @@
|
||||
-- Migration 033: Add platform_dispensary_id to dispensary_crawl_status view
-- This exposes platform ID status for scheduling transparency.
-- NOTE(review): the view reads cs.interval_minutes; confirm that column exists in every
-- dispensary_crawl_schedule schema variant (e.g. a cron_expression-based one) before applying.

-- Recreate the dispensary_crawl_status view with platform_dispensary_id.
-- The view is dropped first because the column list changes; CASCADE also drops any
-- objects that depend on the old view.
DROP VIEW IF EXISTS public.dispensary_crawl_status CASCADE;

CREATE OR REPLACE VIEW public.dispensary_crawl_status AS
SELECT
    -- Dispensary identity and crawl configuration
    d.id AS dispensary_id,
    COALESCE(d.dba_name, d.name) AS dispensary_name,
    d.slug AS dispensary_slug,
    d.city,
    d.state,
    d.menu_url,
    d.menu_type,
    d.platform_dispensary_id,
    d.scrape_enabled,
    d.last_crawl_at,
    d.crawl_status,
    d.product_crawler_mode,
    d.product_provider,
    -- Schedule row (NULL columns when the dispensary has no schedule)
    cs.interval_minutes,
    cs.is_active,
    cs.priority,
    cs.last_run_at,
    cs.next_run_at,
    cs.last_status AS schedule_last_status,
    cs.last_error AS schedule_last_error,
    cs.consecutive_failures,
    -- Most recent crawl job (NULL columns when no job exists)
    j.id AS latest_job_id,
    j.status AS latest_job_status,
    j.job_type AS latest_job_type,
    j.started_at AS latest_job_started,
    j.completed_at AS latest_job_completed,
    j.products_found AS latest_products_found,
    j.products_new AS latest_products_created,
    j.products_updated AS latest_products_updated,
    j.error_message AS latest_job_error,
    -- Computed scheduling eligibility: dutchie menu with a resolved platform ID.
    CASE
        WHEN d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL THEN true
        ELSE false
    END AS can_crawl,
    -- Human-readable reason; the first failing check wins.
    CASE
        WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'menu_type not detected'
        WHEN d.menu_type <> 'dutchie' THEN 'not dutchie platform'
        WHEN d.platform_dispensary_id IS NULL THEN 'platform ID not resolved'
        WHEN d.scrape_enabled = false THEN 'scraping disabled'
        ELSE 'ready'
    END AS schedule_status_reason
FROM public.dispensaries d
LEFT JOIN public.dispensary_crawl_schedule cs
    ON cs.dispensary_id = d.id
LEFT JOIN LATERAL (
    -- Latest job per dispensary. Only the columns the view exposes are selected, and
    -- id breaks ties so the chosen row is deterministic when created_at values are equal.
    SELECT
        dj.id,
        dj.status,
        dj.job_type,
        dj.started_at,
        dj.completed_at,
        dj.products_found,
        dj.products_new,
        dj.products_updated,
        dj.error_message
    FROM public.dispensary_crawl_jobs dj
    WHERE dj.dispensary_id = d.id
    ORDER BY dj.created_at DESC, dj.id DESC
    LIMIT 1
) j ON true
-- The view is limited to Arizona dispensaries.
WHERE d.state = 'AZ';

-- Done!
SELECT 'Migration 033 completed successfully' AS status;
|
||||
@@ -1,7 +1,15 @@
|
||||
import { Pool } from 'pg';
|
||||
|
||||
// Consolidated DB connection string, resolved in priority order:
//   1. CRAWLSY_DATABASE_URL (e.g., crawlsy_local, crawlsy_prod)
//   2. DATABASE_URL (default)
//   3. Hard-coded local development fallback
// NOTE(review): the fallback embeds credentials for a local database; confirm it is
// never relied on outside local development.
const DATABASE_URL =
  process.env.CRAWLSY_DATABASE_URL ||
  process.env.DATABASE_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/crawlsy_local';

// The pool uses the resolved URL above (not process.env.DATABASE_URL directly) so the
// CRAWLSY_DATABASE_URL override takes effect; a single connectionString key is given.
const pool = new Pool({
  connectionString: DATABASE_URL,
});
|
||||
|
||||
export async function runMigrations() {
|
||||
|
||||
@@ -154,7 +154,7 @@ export async function resolvePlatformDispensaryIds(): Promise<{ resolved: number
|
||||
const { rows: dispensaries } = await query<Dispensary>(
|
||||
`
|
||||
SELECT * FROM dispensaries
|
||||
WHERE platform = 'dutchie' AND platform_dispensary_id IS NULL
|
||||
WHERE menu_type = 'dutchie' AND platform_dispensary_id IS NULL
|
||||
ORDER BY id
|
||||
`
|
||||
);
|
||||
@@ -199,7 +199,7 @@ export async function resolvePlatformDispensaryIds(): Promise<{ resolved: number
|
||||
*/
|
||||
export async function getAllDispensaries(): Promise<Dispensary[]> {
|
||||
const { rows } = await query<Dispensary>(
|
||||
`SELECT * FROM dispensaries WHERE platform = 'dutchie' ORDER BY name`
|
||||
`SELECT * FROM dispensaries WHERE menu_type = 'dutchie' ORDER BY name`
|
||||
);
|
||||
return rows;
|
||||
}
|
||||
@@ -222,7 +222,7 @@ export async function getDispensariesWithPlatformIds(): Promise<Dispensary[]> {
|
||||
const { rows } = await query<Dispensary>(
|
||||
`
|
||||
SELECT * FROM dispensaries
|
||||
WHERE platform = 'dutchie' AND platform_dispensary_id IS NOT NULL
|
||||
WHERE menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL
|
||||
ORDER BY name
|
||||
`
|
||||
);
|
||||
|
||||
@@ -19,6 +19,7 @@ import {
|
||||
deriveStockStatus,
|
||||
calculateTotalQuantity,
|
||||
} from '../types';
|
||||
import { downloadProductImage, imageExists } from '../../utils/image-storage';
|
||||
|
||||
// ============================================================
|
||||
// NORMALIZATION FUNCTIONS
|
||||
@@ -348,6 +349,55 @@ async function upsertProduct(product: Partial<DutchieProduct>): Promise<number>
|
||||
return result.rows[0].id;
|
||||
}
|
||||
|
||||
/**
 * Download a product image and record the local image URLs on its dutchie_products row.
 *
 * The download is skipped when imageExists() reports that an image is already stored
 * for this dispensary / external product / URL combination.
 *
 * @param productId - id of the dutchie_products row to update
 * @param dispensaryId - dispensary id passed through to the image-storage helpers
 * @param externalProductId - external product id passed through to the image-storage helpers
 * @param primaryImageUrl - remote image URL; nothing is downloaded when it is missing
 * @returns `{ downloaded: true }` when the image was stored and the row updated;
 *          `{ downloaded: false }` with no `error` when the image already existed;
 *          `{ downloaded: false, error }` when there was no URL or the attempt failed.
 *          Never throws: failures are reported through `error`.
 */
async function downloadAndUpdateProductImage(
  productId: number,
  dispensaryId: number,
  externalProductId: string,
  primaryImageUrl: string | undefined
): Promise<{ downloaded: boolean; error?: string }> {
  if (!primaryImageUrl) {
    return { downloaded: false, error: 'No image URL' };
  }

  try {
    // Check if we already have this image locally
    const exists = await imageExists(dispensaryId, externalProductId, primaryImageUrl);
    if (exists) {
      return { downloaded: false };
    }

    // Download and process the image
    const result = await downloadProductImage(primaryImageUrl, dispensaryId, externalProductId);

    if (!result.success || !result.urls) {
      // Always report a message so callers that count failures via `error`
      // do not silently drop a failed download that carried no message.
      return { downloaded: false, error: result.error || 'Image download failed' };
    }

    // Update the product record with local image URLs.
    // original_image_url is only filled when empty, preserving the third-party URL.
    await query(
      `
      UPDATE dutchie_products
      SET
        local_image_url = $1,
        local_image_thumb_url = $2,
        local_image_medium_url = $3,
        original_image_url = COALESCE(original_image_url, primary_image_url),
        updated_at = NOW()
      WHERE id = $4
      `,
      [result.urls.full, result.urls.thumb, result.urls.medium, productId]
    );

    return { downloaded: true };
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    return { downloaded: false, error: message || 'Image download failed' };
  }
}
|
||||
|
||||
/**
|
||||
* Insert a snapshot record
|
||||
*/
|
||||
@@ -536,6 +586,8 @@ export interface CrawlResult {
|
||||
modeAProducts?: number;
|
||||
modeBProducts?: number;
|
||||
missingProductsMarked?: number;
|
||||
imagesDownloaded?: number;
|
||||
imageErrors?: number;
|
||||
errorMessage?: string;
|
||||
durationMs: number;
|
||||
}
|
||||
@@ -549,10 +601,14 @@ async function processProducts(
|
||||
products: DutchieRawProduct[],
|
||||
dispensary: Dispensary,
|
||||
pricingType: 'rec' | 'med',
|
||||
crawlMode: CrawlMode
|
||||
): Promise<{ upserted: number; snapshots: number; productIds: Set<string> }> {
|
||||
crawlMode: CrawlMode,
|
||||
options: { downloadImages?: boolean } = {}
|
||||
): Promise<{ upserted: number; snapshots: number; productIds: Set<string>; imagesDownloaded: number; imageErrors: number }> {
|
||||
const { downloadImages = true } = options;
|
||||
let upserted = 0;
|
||||
let snapshots = 0;
|
||||
let imagesDownloaded = 0;
|
||||
let imageErrors = 0;
|
||||
const productIds = new Set<string>();
|
||||
|
||||
for (const raw of products) {
|
||||
@@ -569,6 +625,21 @@ async function processProducts(
|
||||
const productId = await upsertProduct(normalizedProduct);
|
||||
upserted++;
|
||||
|
||||
// Download image locally if enabled
|
||||
if (downloadImages && normalizedProduct.primaryImageUrl) {
|
||||
const imageResult = await downloadAndUpdateProductImage(
|
||||
productId,
|
||||
dispensary.id,
|
||||
externalId,
|
||||
normalizedProduct.primaryImageUrl
|
||||
);
|
||||
if (imageResult.downloaded) {
|
||||
imagesDownloaded++;
|
||||
} else if (imageResult.error && imageResult.error !== 'No image URL') {
|
||||
imageErrors++;
|
||||
}
|
||||
}
|
||||
|
||||
// Create snapshot with crawl mode
|
||||
const snapshot = normalizeSnapshot(
|
||||
raw,
|
||||
@@ -585,7 +656,7 @@ async function processProducts(
|
||||
}
|
||||
}
|
||||
|
||||
return { upserted, snapshots, productIds };
|
||||
return { upserted, snapshots, productIds, imagesDownloaded, imageErrors };
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -598,9 +669,9 @@ async function processProducts(
|
||||
export async function crawlDispensaryProducts(
|
||||
dispensary: Dispensary,
|
||||
pricingType: 'rec' | 'med' = 'rec',
|
||||
options: { useBothModes?: boolean } = {}
|
||||
options: { useBothModes?: boolean; downloadImages?: boolean } = {}
|
||||
): Promise<CrawlResult> {
|
||||
const { useBothModes = true } = options;
|
||||
const { useBothModes = true, downloadImages = true } = options;
|
||||
const startTime = Date.now();
|
||||
|
||||
if (!dispensary.platformDispensaryId) {
|
||||
@@ -620,6 +691,8 @@ export async function crawlDispensaryProducts(
|
||||
|
||||
let totalUpserted = 0;
|
||||
let totalSnapshots = 0;
|
||||
let totalImagesDownloaded = 0;
|
||||
let totalImageErrors = 0;
|
||||
let modeAProducts = 0;
|
||||
let modeBProducts = 0;
|
||||
let missingMarked = 0;
|
||||
@@ -656,10 +729,13 @@ export async function crawlDispensaryProducts(
|
||||
bothResults.merged.products,
|
||||
dispensary,
|
||||
pricingType,
|
||||
'mode_a' // Use mode_a for merged products (convention)
|
||||
'mode_a', // Use mode_a for merged products (convention)
|
||||
{ downloadImages }
|
||||
);
|
||||
totalUpserted = mergedResult.upserted;
|
||||
totalSnapshots = mergedResult.snapshots;
|
||||
totalImagesDownloaded = mergedResult.imagesDownloaded;
|
||||
totalImageErrors = mergedResult.imageErrors;
|
||||
}
|
||||
} else {
|
||||
// Single mode crawl (Mode A only)
|
||||
@@ -676,9 +752,11 @@ export async function crawlDispensaryProducts(
|
||||
modeAProductIds.add(p._id);
|
||||
}
|
||||
|
||||
const result = await processProducts(products, dispensary, pricingType, crawlMode);
|
||||
const result = await processProducts(products, dispensary, pricingType, crawlMode, { downloadImages });
|
||||
totalUpserted = result.upserted;
|
||||
totalSnapshots = result.snapshots;
|
||||
totalImagesDownloaded = result.imagesDownloaded;
|
||||
totalImageErrors = result.imageErrors;
|
||||
}
|
||||
|
||||
// Mark products as missing using UNION of Mode A + Mode B
|
||||
@@ -695,7 +773,7 @@ export async function crawlDispensaryProducts(
|
||||
// Update dispensary stats
|
||||
await updateDispensaryCrawlStats(dispensary.id, totalUpserted);
|
||||
|
||||
console.log(`[ProductCrawler] Completed: ${totalUpserted} products, ${totalSnapshots} snapshots, ${missingMarked} marked missing`);
|
||||
console.log(`[ProductCrawler] Completed: ${totalUpserted} products, ${totalSnapshots} snapshots, ${missingMarked} marked missing, ${totalImagesDownloaded} images downloaded`);
|
||||
|
||||
return {
|
||||
success: true,
|
||||
@@ -706,6 +784,8 @@ export async function crawlDispensaryProducts(
|
||||
modeAProducts,
|
||||
modeBProducts,
|
||||
missingProductsMarked: missingMarked,
|
||||
imagesDownloaded: totalImagesDownloaded,
|
||||
imageErrors: totalImageErrors,
|
||||
durationMs: Date.now() - startTime,
|
||||
};
|
||||
} catch (error: any) {
|
||||
@@ -734,7 +814,7 @@ export async function crawlAllArizonaDispensaries(
|
||||
const { rows: dispensaries } = await query<Dispensary>(
|
||||
`
|
||||
SELECT * FROM dispensaries
|
||||
WHERE state = 'AZ' AND platform = 'dutchie' AND platform_dispensary_id IS NOT NULL
|
||||
WHERE state = 'AZ' AND menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL
|
||||
ORDER BY id
|
||||
`
|
||||
);
|
||||
|
||||
@@ -452,7 +452,7 @@ async function executeProductCrawl(config: Record<string, any>): Promise<{
|
||||
const { rows: dispensaries } = await query<Dispensary>(
|
||||
`
|
||||
SELECT * FROM dispensaries
|
||||
WHERE state = 'AZ' AND platform = 'dutchie' AND platform_dispensary_id IS NOT NULL
|
||||
WHERE state = 'AZ' AND menu_type = 'dutchie' AND platform_dispensary_id IS NOT NULL
|
||||
ORDER BY last_crawled_at ASC NULLS FIRST
|
||||
`
|
||||
);
|
||||
|
||||
@@ -3,6 +3,7 @@ import cors from 'cors';
|
||||
import path from 'path';
|
||||
import dotenv from 'dotenv';
|
||||
import { initializeMinio, isMinioEnabled } from './utils/minio';
|
||||
import { initializeImageStorage } from './utils/image-storage';
|
||||
import { logger } from './services/logger';
|
||||
import { cleanupOrphanedJobs } from './services/proxyTestQueue';
|
||||
|
||||
@@ -102,6 +103,7 @@ async function startServer() {
|
||||
logger.info('system', 'Starting server...');
|
||||
|
||||
await initializeMinio();
|
||||
await initializeImageStorage();
|
||||
logger.info('system', isMinioEnabled() ? 'MinIO storage initialized' : 'Local filesystem storage initialized');
|
||||
|
||||
// Clean up any orphaned proxy test jobs from previous server runs
|
||||
|
||||
@@ -1,16 +1,15 @@
|
||||
import { Router } from 'express';
|
||||
import { authMiddleware } from '../auth/middleware';
|
||||
import { pool } from '../db/migrate';
|
||||
import { query as azQuery } from '../dutchie-az/db/connection'; // AZ pipeline DB
|
||||
|
||||
const router = Router();
|
||||
router.use(authMiddleware);
|
||||
|
||||
// Get dashboard stats - now sourced from AZ pipeline (dutchie_az DB)
|
||||
// Get dashboard stats - consolidated DB (all tables in one DB now)
|
||||
router.get('/stats', async (req, res) => {
|
||||
try {
|
||||
// Dispensary stats (AZ pipeline)
|
||||
const dispensariesResult = await azQuery(`
|
||||
// Dispensary stats
|
||||
const dispensariesResult = await pool.query(`
|
||||
SELECT
|
||||
COUNT(*) as total,
|
||||
COUNT(*) FILTER (WHERE scrape_enabled = true) as active,
|
||||
@@ -21,8 +20,8 @@ router.get('/stats', async (req, res) => {
|
||||
FROM dispensaries
|
||||
`);
|
||||
|
||||
// Product stats from AZ pipeline (dutchie_products)
|
||||
const productsResult = await azQuery(`
|
||||
// Product stats from dutchie_products table
|
||||
const productsResult = await pool.query(`
|
||||
SELECT
|
||||
COUNT(*) as total,
|
||||
COUNT(*) FILTER (WHERE stock_status = 'in_stock') as in_stock,
|
||||
@@ -47,8 +46,8 @@ router.get('/stats', async (req, res) => {
|
||||
WHERE clicked_at >= NOW() - INTERVAL '24 hours'
|
||||
`);
|
||||
|
||||
// Recent products added (last 24 hours) from AZ pipeline
|
||||
const recentProductsResult = await azQuery(`
|
||||
// Recent products added (last 24 hours)
|
||||
const recentProductsResult = await pool.query(`
|
||||
SELECT COUNT(*) as new_products_24h
|
||||
FROM dutchie_products
|
||||
WHERE created_at >= NOW() - INTERVAL '24 hours'
|
||||
@@ -77,7 +76,7 @@ router.get('/stats', async (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
// Get recent activity - uses AZ data
|
||||
// Get recent activity - from consolidated DB
|
||||
router.get('/activity', async (req, res) => {
|
||||
try {
|
||||
const { limit = 20 } = req.query;
|
||||
|
||||
@@ -5,10 +5,15 @@ import { pool } from '../db/migrate';
|
||||
const router = Router();
|
||||
router.use(authMiddleware);
|
||||
|
||||
// Valid menu_type values
|
||||
const VALID_MENU_TYPES = ['dutchie', 'treez', 'jane', 'weedmaps', 'leafly', 'meadow', 'blaze', 'flowhub', 'dispense', 'cova', 'other', 'unknown'];
|
||||
|
||||
// Get all dispensaries
|
||||
router.get('/', async (req, res) => {
|
||||
try {
|
||||
const result = await pool.query(`
|
||||
const { menu_type } = req.query;
|
||||
|
||||
let query = `
|
||||
SELECT
|
||||
id,
|
||||
azdhs_id,
|
||||
@@ -30,14 +35,29 @@ router.get('/', async (req, res) => {
|
||||
latitude,
|
||||
longitude,
|
||||
menu_url,
|
||||
menu_type,
|
||||
menu_provider,
|
||||
menu_provider_confidence,
|
||||
scraper_template,
|
||||
last_menu_scrape,
|
||||
menu_scrape_status,
|
||||
platform_dispensary_id,
|
||||
created_at,
|
||||
updated_at
|
||||
FROM dispensaries
|
||||
ORDER BY name
|
||||
`);
|
||||
`;
|
||||
|
||||
const params: any[] = [];
|
||||
|
||||
// Filter by menu_type if provided
|
||||
if (menu_type) {
|
||||
query += ` WHERE menu_type = $1`;
|
||||
params.push(menu_type);
|
||||
}
|
||||
|
||||
query += ` ORDER BY name`;
|
||||
|
||||
const result = await pool.query(query, params);
|
||||
|
||||
res.json({ dispensaries: result.rows });
|
||||
} catch (error) {
|
||||
@@ -46,6 +66,22 @@ router.get('/', async (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
// Menu type statistics: number of dispensaries per menu_type (largest group first),
// returned together with the list of accepted menu_type values.
router.get('/stats/menu-types', async (req, res) => {
  const menuTypeCountsSql = `
      SELECT menu_type, COUNT(*) as count
      FROM dispensaries
      GROUP BY menu_type
      ORDER BY count DESC
    `;

  try {
    const { rows } = await pool.query(menuTypeCountsSql);
    res.json({ menu_types: rows, valid_types: VALID_MENU_TYPES });
  } catch (err) {
    console.error('Error fetching menu type stats:', err);
    res.status(500).json({ error: 'Failed to fetch menu type stats' });
  }
});
|
||||
|
||||
// Get single dispensary by slug
|
||||
router.get('/:slug', async (req, res) => {
|
||||
try {
|
||||
@@ -73,10 +109,14 @@ router.get('/:slug', async (req, res) => {
|
||||
latitude,
|
||||
longitude,
|
||||
menu_url,
|
||||
menu_type,
|
||||
menu_provider,
|
||||
menu_provider_confidence,
|
||||
scraper_template,
|
||||
scraper_config,
|
||||
last_menu_scrape,
|
||||
menu_scrape_status,
|
||||
platform_dispensary_id,
|
||||
created_at,
|
||||
updated_at
|
||||
FROM dispensaries
|
||||
@@ -106,11 +146,19 @@ router.put('/:id', async (req, res) => {
|
||||
google_rating,
|
||||
google_review_count,
|
||||
menu_url,
|
||||
menu_type,
|
||||
scraper_template,
|
||||
scraper_config,
|
||||
menu_scrape_status
|
||||
} = req.body;
|
||||
|
||||
// Validate menu_type if provided
|
||||
if (menu_type !== undefined && menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) {
|
||||
return res.status(400).json({
|
||||
error: `Invalid menu_type. Must be one of: ${VALID_MENU_TYPES.join(', ')}`
|
||||
});
|
||||
}
|
||||
|
||||
const result = await pool.query(`
|
||||
UPDATE dispensaries
|
||||
SET
|
||||
@@ -121,11 +169,12 @@ router.put('/:id', async (req, res) => {
|
||||
google_rating = COALESCE($5, google_rating),
|
||||
google_review_count = COALESCE($6, google_review_count),
|
||||
menu_url = COALESCE($7, menu_url),
|
||||
scraper_template = COALESCE($8, scraper_template),
|
||||
scraper_config = COALESCE($9, scraper_config),
|
||||
menu_scrape_status = COALESCE($10, menu_scrape_status),
|
||||
menu_type = COALESCE($8, menu_type),
|
||||
scraper_template = COALESCE($9, scraper_template),
|
||||
scraper_config = COALESCE($10, scraper_config),
|
||||
menu_scrape_status = COALESCE($11, menu_scrape_status),
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
WHERE id = $11
|
||||
WHERE id = $12
|
||||
RETURNING *
|
||||
`, [
|
||||
dba_name,
|
||||
@@ -135,6 +184,7 @@ router.put('/:id', async (req, res) => {
|
||||
google_rating,
|
||||
google_review_count,
|
||||
menu_url,
|
||||
menu_type,
|
||||
scraper_template,
|
||||
scraper_config,
|
||||
menu_scrape_status,
|
||||
@@ -384,4 +434,72 @@ router.post('/:slug/scrape', async (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
// Update menu_type for a dispensary (dedicated endpoint)
//
// PATCH /:id/menu-type
//   body: { menu_type } - one of VALID_MENU_TYPES, or null / '' to clear the value
// Responses:
//   200 { success, dispensary }  - updated row
//   400                          - :id is not an integer, or menu_type is not allowed
//   404                          - no dispensary with that id
//   500                          - unexpected failure
router.patch('/:id/menu-type', async (req, res) => {
  try {
    const { id } = req.params;
    const { menu_type } = req.body;

    // Reject ids that cannot be a 32-bit integer up front; otherwise the database
    // rejects the parameter and the client sees a 500 instead of a 400.
    const dispensaryId = /^[+-]?\d+$/.test(id) ? Number(id) : NaN;
    if (!Number.isInteger(dispensaryId) || dispensaryId < -2147483648 || dispensaryId > 2147483647) {
      return res.status(400).json({ error: 'Invalid dispensary id' });
    }

    // Validate menu_type (null and '' are accepted and clear the value)
    if (menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) {
      return res.status(400).json({
        error: `Invalid menu_type. Must be one of: ${VALID_MENU_TYPES.join(', ')} (or null to clear)`
      });
    }

    // '' is stored as NULL
    const result = await pool.query(`
      UPDATE dispensaries
      SET menu_type = $1, updated_at = CURRENT_TIMESTAMP
      WHERE id = $2
      RETURNING id, name, slug, menu_type, menu_provider, menu_url
    `, [menu_type || null, dispensaryId]);

    if (result.rows.length === 0) {
      return res.status(404).json({ error: 'Dispensary not found' });
    }

    res.json({
      success: true,
      dispensary: result.rows[0]
    });
  } catch (error) {
    console.error('Error updating menu_type:', error);
    res.status(500).json({ error: 'Failed to update menu_type' });
  }
});
|
||||
|
||||
// Bulk update menu_type for multiple dispensaries
//
// POST /bulk/menu-type
//   body: { dispensary_ids: (number | numeric string)[], menu_type }
//         menu_type is one of VALID_MENU_TYPES, or null / '' to clear the value
// Responses:
//   200 { success, updated_count, dispensaries } - ids that match no row are ignored
//   400                                          - bad dispensary_ids or menu_type
//   500                                          - unexpected failure
router.post('/bulk/menu-type', async (req, res) => {
  try {
    const { dispensary_ids, menu_type } = req.body;

    if (!Array.isArray(dispensary_ids) || dispensary_ids.length === 0) {
      return res.status(400).json({ error: 'dispensary_ids must be a non-empty array' });
    }

    // Normalize ids to numbers. Anything that is not a 32-bit integer would fail the
    // $2::int[] cast in the database and surface as a 500, so reject it here with a 400.
    const ids: number[] = dispensary_ids.map((value: unknown) => {
      if (typeof value === 'number') {
        return value;
      }
      if (typeof value === 'string' && /^[+-]?\d+$/.test(value.trim())) {
        return Number(value.trim());
      }
      return NaN;
    });
    const hasInvalidId = ids.some(
      (n) => !Number.isInteger(n) || n < -2147483648 || n > 2147483647
    );
    if (hasInvalidId) {
      return res.status(400).json({ error: 'dispensary_ids must contain only integer ids' });
    }

    // Validate menu_type (null and '' are accepted and clear the value)
    if (menu_type !== null && menu_type !== '' && !VALID_MENU_TYPES.includes(menu_type)) {
      return res.status(400).json({
        error: `Invalid menu_type. Must be one of: ${VALID_MENU_TYPES.join(', ')} (or null to clear)`
      });
    }

    // '' is stored as NULL
    const result = await pool.query(`
      UPDATE dispensaries
      SET menu_type = $1, updated_at = CURRENT_TIMESTAMP
      WHERE id = ANY($2::int[])
      RETURNING id, name, slug, menu_type
    `, [menu_type || null, ids]);

    res.json({
      success: true,
      updated_count: result.rowCount,
      dispensaries: result.rows
    });
  } catch (error) {
    console.error('Error bulk updating menu_type:', error);
    res.status(500).json({ error: 'Failed to bulk update menu_type' });
  }
});
|
||||
|
||||
export default router;
|
||||
|
||||
@@ -27,6 +27,7 @@ import {
|
||||
ensureAllDispensariesHaveSchedules,
|
||||
} from '../services/dispensary-orchestrator';
|
||||
import { pool } from '../db/migrate';
|
||||
import { resolveDispensaryId } from '../dutchie-az/services/graphql-client';
|
||||
|
||||
const router = Router();
|
||||
router.use(authMiddleware);
|
||||
@@ -354,14 +355,91 @@ router.get('/due', async (req: Request, res: Response) => {
|
||||
|
||||
/**
|
||||
* GET /api/schedule/dispensaries
|
||||
* Get all dispensary schedule statuses (uses the view)
|
||||
* Get all dispensary schedule statuses with optional filters
|
||||
* Query params:
|
||||
* - state: filter by state (e.g., 'AZ')
|
||||
* - search: search by name or slug
|
||||
*/
|
||||
router.get('/dispensaries', async (req: Request, res: Response) => {
|
||||
try {
|
||||
const result = await pool.query(`
|
||||
SELECT * FROM dispensary_crawl_status
|
||||
ORDER BY priority DESC, dispensary_name
|
||||
`);
|
||||
const { state, search } = req.query;
|
||||
|
||||
// Build dynamic query with optional filters
|
||||
const conditions: string[] = [];
|
||||
const params: any[] = [];
|
||||
let paramIndex = 1;
|
||||
|
||||
if (state) {
|
||||
conditions.push(`d.state = $${paramIndex}`);
|
||||
params.push(state);
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
if (search) {
|
||||
conditions.push(`(d.name ILIKE $${paramIndex} OR d.slug ILIKE $${paramIndex} OR d.dba_name ILIKE $${paramIndex})`);
|
||||
params.push(`%${search}%`);
|
||||
paramIndex++;
|
||||
}
|
||||
|
||||
const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
|
||||
|
||||
const query = `
|
||||
SELECT
|
||||
d.id AS dispensary_id,
|
||||
COALESCE(d.dba_name, d.name) AS dispensary_name,
|
||||
d.slug AS dispensary_slug,
|
||||
d.city,
|
||||
d.state,
|
||||
d.menu_url,
|
||||
d.menu_type,
|
||||
d.platform_dispensary_id,
|
||||
d.scrape_enabled,
|
||||
d.last_crawl_at,
|
||||
d.crawl_status,
|
||||
d.product_crawler_mode,
|
||||
d.product_provider,
|
||||
cs.interval_minutes,
|
||||
cs.is_active,
|
||||
cs.priority,
|
||||
cs.last_run_at,
|
||||
cs.next_run_at,
|
||||
cs.last_status AS schedule_last_status,
|
||||
cs.last_error AS schedule_last_error,
|
||||
cs.consecutive_failures,
|
||||
j.id AS latest_job_id,
|
||||
j.status AS latest_job_status,
|
||||
j.job_type AS latest_job_type,
|
||||
j.started_at AS latest_job_started,
|
||||
j.completed_at AS latest_job_completed,
|
||||
j.products_found AS latest_products_found,
|
||||
j.products_new AS latest_products_created,
|
||||
j.products_updated AS latest_products_updated,
|
||||
j.error_message AS latest_job_error,
|
||||
CASE
|
||||
WHEN d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL THEN true
|
||||
ELSE false
|
||||
END AS can_crawl,
|
||||
CASE
|
||||
WHEN d.menu_type IS NULL OR d.menu_type = 'unknown' THEN 'menu_type not detected'
|
||||
WHEN d.menu_type != 'dutchie' THEN 'not dutchie platform'
|
||||
WHEN d.platform_dispensary_id IS NULL THEN 'platform ID not resolved'
|
||||
WHEN d.scrape_enabled = false THEN 'scraping disabled'
|
||||
ELSE 'ready'
|
||||
END AS schedule_status_reason
|
||||
FROM public.dispensaries d
|
||||
LEFT JOIN public.dispensary_crawl_schedule cs ON cs.dispensary_id = d.id
|
||||
LEFT JOIN LATERAL (
|
||||
SELECT *
|
||||
FROM public.dispensary_crawl_jobs dj
|
||||
WHERE dj.dispensary_id = d.id
|
||||
ORDER BY dj.created_at DESC
|
||||
LIMIT 1
|
||||
) j ON true
|
||||
${whereClause}
|
||||
ORDER BY cs.priority DESC NULLS LAST, COALESCE(d.dba_name, d.name)
|
||||
`;
|
||||
|
||||
const result = await pool.query(query, params);
|
||||
res.json({ dispensaries: result.rows });
|
||||
} catch (error: any) {
|
||||
console.error('Error fetching dispensary schedules:', error);
|
||||
@@ -589,4 +667,319 @@ router.post('/dispensaries/bootstrap', requireRole('superadmin', 'admin'), async
|
||||
}
|
||||
});
|
||||
|
||||
// ============================================
|
||||
// Platform ID & Menu Type Detection Endpoints
|
||||
// ============================================
|
||||
|
||||
/**
 * POST /api/schedule/dispensaries/:id/resolve-platform-id
 * Resolve the Dutchie platform_dispensary_id from menu_url slug
 *
 * Steps:
 *   1. Load the dispensary (404 when it does not exist).
 *   2. Return early with `already_resolved: true` when an ID is already stored.
 *   3. Choose the slug: the path segment following `embedded-menu/`,
 *      `dispensary/` or `dispensaries/` in menu_url, otherwise the stored slug.
 *   4. Call resolveDispensaryId(slug); respond 404 when nothing comes back.
 *   5. Store the ID and, when menu_type is NULL or 'unknown', set it to 'dutchie'.
 *
 * Requires the `superadmin` or `admin` role.
 */
router.post('/dispensaries/:id/resolve-platform-id', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
  try {
    // Explicit radix so the path segment is always read as decimal.
    const dispensaryId = parseInt(req.params.id, 10);
    if (isNaN(dispensaryId)) {
      return res.status(400).json({ error: 'Invalid dispensary ID' });
    }

    // Get dispensary info
    const dispensaryResult = await pool.query(`
      SELECT id, name, slug, menu_url, menu_type, platform_dispensary_id
      FROM dispensaries WHERE id = $1
    `, [dispensaryId]);

    if (dispensaryResult.rows.length === 0) {
      return res.status(404).json({ error: 'Dispensary not found' });
    }

    const dispensary = dispensaryResult.rows[0];

    // Check if already resolved
    if (dispensary.platform_dispensary_id) {
      return res.json({
        success: true,
        message: 'Platform ID already resolved',
        platform_dispensary_id: dispensary.platform_dispensary_id,
        already_resolved: true
      });
    }

    // Extract slug from menu_url for Dutchie URLs; fall back to the stored slug
    let slugToResolve = dispensary.slug;
    if (dispensary.menu_url) {
      // Match embedded-menu or dispensary URLs
      const match = dispensary.menu_url.match(/(?:embedded-menu|dispensar(?:y|ies))\/([^\/\?#]+)/i);
      if (match) {
        slugToResolve = match[1];
      }
    }

    if (!slugToResolve) {
      return res.status(400).json({
        error: 'No slug available to resolve platform ID',
        menu_url: dispensary.menu_url
      });
    }

    console.log(`[Schedule] Resolving platform ID for ${dispensary.name} using slug: ${slugToResolve}`);

    // Resolve platform ID using GraphQL client
    const platformId = await resolveDispensaryId(slugToResolve);

    if (!platformId) {
      return res.status(404).json({
        error: 'Could not resolve platform ID',
        slug_tried: slugToResolve,
        message: 'The dispensary might not be on Dutchie or the slug is incorrect'
      });
    }

    // Update the dispensary with resolved platform ID.
    // NULLIF treats a stored 'unknown' the same as NULL: a dispensary whose
    // ID was just resolved through Dutchie must not stay flagged as
    // undetected, otherwise it remains non-crawlable. Any other explicit
    // menu_type is left untouched.
    await pool.query(`
      UPDATE dispensaries
      SET platform_dispensary_id = $1,
          menu_type = COALESCE(NULLIF(menu_type, 'unknown'), 'dutchie'),
          updated_at = NOW()
      WHERE id = $2
    `, [platformId, dispensaryId]);

    res.json({
      success: true,
      platform_dispensary_id: platformId,
      slug_resolved: slugToResolve,
      message: `Platform ID resolved: ${platformId}`
    });
  } catch (error: any) {
    console.error('Error resolving platform ID:', error);
    res.status(500).json({ error: 'Failed to resolve platform ID', details: error.message });
  }
});
|
||||
|
||||
/**
 * POST /api/schedule/dispensaries/:id/detect-menu-type
 * Detect menu type from menu_url
 *
 * Uses menu_url when present, otherwise website. The URL is compared
 * case-insensitively against known platform substrings, in priority order,
 * and the first match wins; no match stores 'unknown'. The detected value
 * always overwrites dispensaries.menu_type.
 *
 * Requires the `superadmin` or `admin` role.
 */
router.post('/dispensaries/:id/detect-menu-type', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
  try {
    // Explicit radix so the path segment is always read as decimal.
    const dispensaryId = parseInt(req.params.id, 10);
    if (isNaN(dispensaryId)) {
      return res.status(400).json({ error: 'Invalid dispensary ID' });
    }

    // Get dispensary info
    const dispensaryResult = await pool.query(`
      SELECT id, name, menu_url, website FROM dispensaries WHERE id = $1
    `, [dispensaryId]);

    if (dispensaryResult.rows.length === 0) {
      return res.status(404).json({ error: 'Dispensary not found' });
    }

    const dispensary = dispensaryResult.rows[0];
    const urlToCheck = dispensary.menu_url || dispensary.website;

    if (!urlToCheck) {
      return res.status(400).json({ error: 'No menu_url or website to detect from' });
    }

    // Platform markers in priority order (first match wins).
    const platformMarkers: Array<[string, string[]]> = [
      ['dutchie', ['dutchie.com', 'embedded-menu']],
      ['jane', ['iheartjane.com', 'jane.co']],
      ['weedmaps', ['weedmaps.com']],
      ['leafly', ['leafly.com']],
      ['treez', ['treez.io', 'treez.co']],
      ['meadow', ['meadow.com']],
      ['blaze', ['blaze.me', 'blazepay']],
      ['flowhub', ['flowhub.com']],
      ['dispense', ['dispense.app']],
      ['cova', ['covasoft.com']]
    ];

    // Host names are case-insensitive, so match against a lowercased copy.
    // The original string is still what gets echoed back in url_checked.
    const normalizedUrl = String(urlToCheck).toLowerCase();
    const hit = platformMarkers.find(([, markers]) =>
      markers.some((marker) => normalizedUrl.includes(marker))
    );
    const detectedType: string = hit ? hit[0] : 'unknown';

    // Update menu_type
    await pool.query(`
      UPDATE dispensaries
      SET menu_type = $1, updated_at = NOW()
      WHERE id = $2
    `, [detectedType, dispensaryId]);

    res.json({
      success: true,
      menu_type: detectedType,
      url_checked: urlToCheck,
      message: `Menu type detected: ${detectedType}`
    });
  } catch (error: any) {
    console.error('Error detecting menu type:', error);
    res.status(500).json({ error: 'Failed to detect menu type' });
  }
});
|
||||
|
||||
/**
 * POST /api/schedule/dispensaries/:id/refresh-detection
 * Combined: detect menu_type AND resolve platform_dispensary_id if dutchie
 *
 * 1. Detects the platform from menu_url (or website) by case-insensitive
 *    substring match and stores it in menu_type ('unknown' when nothing matches).
 * 2. When the platform is 'dutchie', extracts the slug from the URL (falling
 *    back to the stored slug) and calls resolveDispensaryId. A resolution
 *    failure is logged as a warning and does not fail the request.
 *
 * In the response, `platform_dispensary_id` is the ID resolved by this call
 * (null when nothing was resolved) and `can_crawl` is true only when the
 * platform is dutchie and an ID was resolved.
 *
 * Requires the `superadmin` or `admin` role.
 */
router.post('/dispensaries/:id/refresh-detection', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
  try {
    // Explicit radix so the path segment is always read as decimal.
    const dispensaryId = parseInt(req.params.id, 10);
    if (isNaN(dispensaryId)) {
      return res.status(400).json({ error: 'Invalid dispensary ID' });
    }

    // Get dispensary info
    const dispensaryResult = await pool.query(`
      SELECT id, name, slug, menu_url, website FROM dispensaries WHERE id = $1
    `, [dispensaryId]);

    if (dispensaryResult.rows.length === 0) {
      return res.status(404).json({ error: 'Dispensary not found' });
    }

    const dispensary = dispensaryResult.rows[0];
    const urlToCheck = dispensary.menu_url || dispensary.website;

    if (!urlToCheck) {
      return res.status(400).json({ error: 'No menu_url or website to detect from' });
    }

    // Platform markers in priority order (first match wins).
    const platformMarkers: Array<[string, string[]]> = [
      ['dutchie', ['dutchie.com', 'embedded-menu']],
      ['jane', ['iheartjane.com', 'jane.co']],
      ['weedmaps', ['weedmaps.com']],
      ['leafly', ['leafly.com']],
      ['treez', ['treez.io', 'treez.co']],
      ['meadow', ['meadow.com']],
      ['blaze', ['blaze.me', 'blazepay']],
      ['flowhub', ['flowhub.com']],
      ['dispense', ['dispense.app']],
      ['cova', ['covasoft.com']]
    ];

    // Match against a lowercased copy so detection is case-insensitive, in
    // line with the /i slug regex used below.
    const normalizedUrl = String(urlToCheck).toLowerCase();
    const hit = platformMarkers.find(([, markers]) =>
      markers.some((marker) => normalizedUrl.includes(marker))
    );
    const detectedType: string = hit ? hit[0] : 'unknown';

    // Update menu_type first
    await pool.query(`
      UPDATE dispensaries SET menu_type = $1, updated_at = NOW() WHERE id = $2
    `, [detectedType, dispensaryId]);

    let platformId: string | null = null;

    // If dutchie, also try to resolve platform ID
    if (detectedType === 'dutchie') {
      let slugToResolve = dispensary.slug;
      const match = urlToCheck.match(/(?:embedded-menu|dispensar(?:y|ies))\/([^\/\?#]+)/i);
      if (match) {
        slugToResolve = match[1];
      }

      if (slugToResolve) {
        try {
          console.log(`[Schedule] Resolving platform ID for ${dispensary.name} using slug: ${slugToResolve}`);
          platformId = await resolveDispensaryId(slugToResolve);

          if (platformId) {
            await pool.query(`
              UPDATE dispensaries SET platform_dispensary_id = $1, updated_at = NOW() WHERE id = $2
            `, [platformId, dispensaryId]);
          }
        } catch (err: any) {
          // Best effort: menu_type is already saved, so report the failure
          // and still return the detection result.
          console.warn(`[Schedule] Failed to resolve platform ID: ${err.message}`);
        }
      }
    }

    res.json({
      success: true,
      menu_type: detectedType,
      platform_dispensary_id: platformId,
      url_checked: urlToCheck,
      can_crawl: detectedType === 'dutchie' && !!platformId
    });
  } catch (error: any) {
    console.error('Error refreshing detection:', error);
    res.status(500).json({ error: 'Failed to refresh detection' });
  }
});
|
||||
|
||||
/**
 * PUT /api/schedule/dispensaries/:id/toggle-active
 * Enable or disable schedule for a dispensary
 *
 * Body: { is_active: boolean } — the strings 'true' / 'false' are also
 * accepted and converted. Anything else is rejected with 400.
 *
 * Upserts the schedule row: when the dispensary has no schedule yet, one is
 * created with interval_minutes = 240 and priority = 0; otherwise only
 * is_active and updated_at change.
 *
 * Requires the `superadmin` or `admin` role.
 */
router.put('/dispensaries/:id/toggle-active', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
  try {
    // Explicit radix so the path segment is always read as decimal.
    const dispensaryId = parseInt(req.params.id, 10);
    if (isNaN(dispensaryId)) {
      return res.status(400).json({ error: 'Invalid dispensary ID' });
    }

    const { is_active } = req.body;

    // Normalise to a real boolean before it reaches SQL or the response.
    // Passing the raw value through let a missing field reach the INSERT
    // as NULL and made the string "false" report 'Schedule enabled'.
    let isActive: boolean;
    if (typeof is_active === 'boolean') {
      isActive = is_active;
    } else if (is_active === 'true' || is_active === 'false') {
      isActive = is_active === 'true';
    } else {
      return res.status(400).json({ error: 'is_active must be a boolean' });
    }

    // Upsert schedule with new is_active value
    const result = await pool.query(`
      INSERT INTO dispensary_crawl_schedule (dispensary_id, is_active, interval_minutes, priority)
      VALUES ($1, $2, 240, 0)
      ON CONFLICT (dispensary_id) DO UPDATE SET
        is_active = $2,
        updated_at = NOW()
      RETURNING *
    `, [dispensaryId, isActive]);

    res.json({
      success: true,
      schedule: result.rows[0],
      message: isActive ? 'Schedule enabled' : 'Schedule disabled'
    });
  } catch (error: any) {
    console.error('Error toggling schedule active status:', error);
    res.status(500).json({ error: 'Failed to toggle schedule' });
  }
});
|
||||
|
||||
/**
 * DELETE /api/schedule/dispensaries/:id/schedule
 * Remove the crawl schedule row belonging to one dispensary.
 *
 * Always answers 200 on a valid ID: `deleted` tells the caller whether a row
 * was actually removed, and `message` carries the matching human-readable text.
 *
 * Requires the `superadmin` or `admin` role.
 */
router.delete('/dispensaries/:id/schedule', requireRole('superadmin', 'admin'), async (req: Request, res: Response) => {
  try {
    const dispensaryId = parseInt(req.params.id);

    if (isNaN(dispensaryId)) {
      return res.status(400).json({ error: 'Invalid dispensary ID' });
    }

    const sql = `
      DELETE FROM dispensary_crawl_schedule WHERE dispensary_id = $1 RETURNING id
    `;
    const result = await pool.query(sql, [dispensaryId]);

    // A positive row count means a schedule existed and is now gone.
    const deleted = result.rowCount > 0;
    const message = deleted ? 'Schedule deleted' : 'No schedule to delete';

    res.json({ success: true, deleted, message });
  } catch (error: any) {
    console.error('Error deleting schedule:', error);
    res.status(500).json({ error: 'Failed to delete schedule' });
  }
});
|
||||
|
||||
export default router;
|
||||
|
||||
@@ -425,7 +425,29 @@ export class DutchieSpider {
|
||||
href = window.location.origin + href;
|
||||
}
|
||||
|
||||
items.push({ name, price, originalPrice, href });
|
||||
// Extract image URL from product card
|
||||
let imageUrl = null;
|
||||
const imgSelectors = [
|
||||
'img[src*="images.dutchie.com"]',
|
||||
'img[src*="dutchie"]',
|
||||
'img[data-testid*="product"]',
|
||||
'img[class*="product"]',
|
||||
'img[class*="Product"]',
|
||||
'picture img',
|
||||
'img'
|
||||
];
|
||||
for (const sel of imgSelectors) {
|
||||
const img = card.querySelector(sel);
|
||||
if (img) {
|
||||
const src = img.getAttribute('src') || img.getAttribute('data-src') || '';
|
||||
if (src && (src.includes('dutchie.com') || src.includes('images.'))) {
|
||||
imageUrl = src;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
items.push({ name, price, originalPrice, href, imageUrl });
|
||||
|
||||
} catch (err) {
|
||||
console.error('Error parsing product card:', err);
|
||||
@@ -447,6 +469,7 @@ export class DutchieSpider {
|
||||
productName: card.name,
|
||||
productPrice: card.price,
|
||||
productOriginalPrice: card.originalPrice,
|
||||
productImageUrl: card.imageUrl, // Pass image from category page
|
||||
requiresBrowser: true
|
||||
},
|
||||
callback: this.parseProductPage.bind(this)
|
||||
@@ -472,21 +495,27 @@ export class DutchieSpider {
|
||||
// @ts-ignore - runs in browser context
|
||||
const allText = document.body.textContent || '';
|
||||
|
||||
// Extract image
|
||||
// Extract image - expanded selectors for better coverage
|
||||
let fullSizeImage = null;
|
||||
const mainImageSelectors = [
|
||||
'img[src*="images.dutchie.com"]',
|
||||
'img[src*="dutchie"]',
|
||||
'img[class*="ProductImage"]',
|
||||
'img[class*="product-image"]',
|
||||
'img[class*="Product"]',
|
||||
'[class*="ImageGallery"] img',
|
||||
'main img',
|
||||
'img[src*="images.dutchie.com"]'
|
||||
'[data-testid*="product"] img',
|
||||
'[data-testid*="image"] img',
|
||||
'picture img',
|
||||
'main img'
|
||||
];
|
||||
|
||||
for (const sel of mainImageSelectors) {
|
||||
// @ts-ignore - runs in browser context
|
||||
const img = document.querySelector(sel) as any;
|
||||
if (img?.src && img.src.includes('dutchie.com')) {
|
||||
fullSizeImage = img.src;
|
||||
const src = img?.src || img?.getAttribute('data-src') || '';
|
||||
if (src && (src.includes('dutchie.com') || src.includes('images.'))) {
|
||||
fullSizeImage = src;
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -593,6 +622,9 @@ export class DutchieSpider {
|
||||
});
|
||||
|
||||
// Create product item
|
||||
// Use image from product page, fallback to category page image
|
||||
const imageUrl = details.fullSizeImage || response.request.metadata.productImageUrl || undefined;
|
||||
|
||||
const product: Product & { storeId: number; categoryId: number } = {
|
||||
dutchieProductId: `${response.request.metadata.storeSlug}-${response.request.metadata.categorySlug}-${Date.now()}-${Math.random()}`,
|
||||
name: productName || 'Unknown Product',
|
||||
@@ -603,7 +635,7 @@ export class DutchieSpider {
|
||||
cbdPercentage: details.cbd || undefined,
|
||||
strainType: details.strainType || undefined,
|
||||
brand: details.brand || undefined,
|
||||
imageUrl: details.fullSizeImage || undefined,
|
||||
imageUrl: imageUrl,
|
||||
dutchieUrl: response.url,
|
||||
metadata: {
|
||||
terpenes: details.terpenes,
|
||||
|
||||
@@ -1,6 +1,13 @@
|
||||
/**
|
||||
* Scraper V2 - Scrapy-inspired web scraping framework
|
||||
*
|
||||
* IMPORTANT: For Dutchie stores, DO NOT USE scrapeStore() from this module.
|
||||
* Dutchie crawling must go through the dutchie-az GraphQL pipeline:
|
||||
* src/dutchie-az/services/product-crawler.ts
|
||||
*
|
||||
* This scraper-v2 module uses DOM-based extraction which is unreliable
|
||||
* for Dutchie. The new dutchie-az pipeline uses GraphQL directly.
|
||||
*
|
||||
* Architecture:
|
||||
* - Engine: Main orchestrator
|
||||
* - Scheduler: Priority queue with deduplication
|
||||
|
||||
@@ -2,6 +2,7 @@ import { ItemPipeline, Product } from './types';
|
||||
import { logger } from '../services/logger';
|
||||
import { pool } from '../db/migrate';
|
||||
import { uploadImageFromUrl } from '../utils/minio';
|
||||
import { normalizeProductName, normalizeBrandName } from '../utils/product-normalizer';
|
||||
|
||||
/**
|
||||
* Validation Pipeline - ensures data quality
|
||||
@@ -166,12 +167,25 @@ function generateSlug(name: string): string {
|
||||
}
|
||||
|
||||
/**
|
||||
* Database Pipeline - saves items to database
|
||||
* Database Pipeline - saves items to database with improved matching
|
||||
*
|
||||
* MATCHING PRIORITY:
|
||||
* 1. external_id (dutchie_product_id) - exact match
|
||||
* 2. normalized name + brand + category - strong match
|
||||
* 3. normalized name + category - weak match (same product, different/missing brand)
|
||||
*
|
||||
* ALWAYS creates a snapshot after upsert for historical tracking.
|
||||
*/
|
||||
export class DatabasePipeline implements ItemPipeline<Product> {
|
||||
name = 'DatabasePipeline';
|
||||
priority = 10; // Low priority - runs last
|
||||
|
||||
private crawlId: string | null = null;
|
||||
|
||||
setCrawlId(id: string): void {
|
||||
this.crawlId = id;
|
||||
}
|
||||
|
||||
async process(item: Product, spider: string): Promise<Product | null> {
|
||||
const client = await pool.connect();
|
||||
|
||||
@@ -180,78 +194,155 @@ export class DatabasePipeline implements ItemPipeline<Product> {
|
||||
const storeId = (item as any).storeId;
|
||||
const categoryId = (item as any).categoryId;
|
||||
const dispensaryId = (item as any).dispensaryId;
|
||||
const categoryName = (item as any).categoryName;
|
||||
|
||||
// Generate slug from name
|
||||
// Generate normalized values for matching
|
||||
const nameNormalized = normalizeProductName(item.name);
|
||||
const brandNormalized = normalizeBrandName(item.brand);
|
||||
const slug = generateSlug(item.name);
|
||||
const externalId = item.dutchieProductId || null;
|
||||
|
||||
if (!storeId || !categoryId) {
|
||||
logger.error('pipeline', `Missing storeId or categoryId for ${item.name}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Check if product exists
|
||||
const existingResult = await client.query(`
|
||||
let productId: number | null = null;
|
||||
let localImagePath: string | null = null;
|
||||
let isNewProduct = false;
|
||||
|
||||
// STEP 1: Try to match by external_id (most reliable)
|
||||
if (externalId) {
|
||||
const extMatch = await client.query(`
|
||||
SELECT id, image_url, local_image_path
|
||||
FROM products
|
||||
WHERE store_id = $1 AND (external_id = $2 OR dutchie_product_id = $2)
|
||||
`, [storeId, externalId]);
|
||||
|
||||
if (extMatch.rows.length > 0) {
|
||||
productId = extMatch.rows[0].id;
|
||||
localImagePath = extMatch.rows[0].local_image_path;
|
||||
logger.debug('pipeline', `Matched by external_id: ${item.name}`);
|
||||
}
|
||||
}
|
||||
|
||||
// STEP 2: Try to match by normalized name + brand + category
|
||||
if (!productId) {
|
||||
const normMatch = await client.query(`
|
||||
SELECT id, image_url, local_image_path
|
||||
FROM products
|
||||
WHERE store_id = $1
|
||||
AND name_normalized = $2
|
||||
AND brand_normalized = $3
|
||||
AND category_id = $4
|
||||
`, [storeId, nameNormalized, brandNormalized, categoryId]);
|
||||
|
||||
if (normMatch.rows.length > 0) {
|
||||
productId = normMatch.rows[0].id;
|
||||
localImagePath = normMatch.rows[0].local_image_path;
|
||||
logger.debug('pipeline', `Matched by normalized name+brand+category: ${item.name}`);
|
||||
}
|
||||
}
|
||||
|
||||
// STEP 3: Fallback to normalized name + category only (weaker match)
|
||||
if (!productId) {
|
||||
const weakMatch = await client.query(`
|
||||
SELECT id, image_url, local_image_path
|
||||
FROM products
|
||||
WHERE store_id = $1
|
||||
AND name_normalized = $2
|
||||
AND category_id = $3
|
||||
LIMIT 1
|
||||
`, [storeId, nameNormalized, categoryId]);
|
||||
|
||||
if (weakMatch.rows.length === 1) {
|
||||
productId = weakMatch.rows[0].id;
|
||||
localImagePath = weakMatch.rows[0].local_image_path;
|
||||
logger.debug('pipeline', `Matched by normalized name+category: ${item.name}`);
|
||||
}
|
||||
}
|
||||
|
||||
// STEP 4: Final fallback - exact name match (legacy compatibility)
|
||||
if (!productId) {
|
||||
const exactMatch = await client.query(`
|
||||
SELECT id, image_url, local_image_path
|
||||
FROM products
|
||||
WHERE store_id = $1 AND name = $2 AND category_id = $3
|
||||
`, [storeId, item.name, categoryId]);
|
||||
|
||||
let localImagePath = null;
|
||||
let productId: number;
|
||||
if (exactMatch.rows.length > 0) {
|
||||
productId = exactMatch.rows[0].id;
|
||||
localImagePath = exactMatch.rows[0].local_image_path;
|
||||
logger.debug('pipeline', `Matched by exact name: ${item.name}`);
|
||||
}
|
||||
}
|
||||
|
||||
if (existingResult.rows.length > 0) {
|
||||
// UPDATE or INSERT
|
||||
if (productId) {
|
||||
// Update existing product
|
||||
productId = existingResult.rows[0].id;
|
||||
localImagePath = existingResult.rows[0].local_image_path;
|
||||
|
||||
await client.query(`
|
||||
UPDATE products
|
||||
SET name = $1, description = $2, price = $3,
|
||||
strain_type = $4, thc_percentage = $5, cbd_percentage = $6,
|
||||
brand = $7, weight = $8, image_url = $9, dutchie_url = $10,
|
||||
brand = $7, weight = $8, image_url = COALESCE($9, image_url), dutchie_url = $10,
|
||||
in_stock = true, metadata = $11, last_seen_at = CURRENT_TIMESTAMP,
|
||||
updated_at = CURRENT_TIMESTAMP, dispensary_id = $13, slug = COALESCE(slug, $14)
|
||||
updated_at = CURRENT_TIMESTAMP, dispensary_id = $13, slug = COALESCE(slug, $14),
|
||||
name_normalized = $15, brand_normalized = $16,
|
||||
external_id = COALESCE(external_id, $17), source_platform = COALESCE(source_platform, 'dutchie')
|
||||
WHERE id = $12
|
||||
`, [
|
||||
item.name, item.description, item.price,
|
||||
item.strainType, item.thcPercentage, item.cbdPercentage,
|
||||
item.brand, item.weight, item.imageUrl, item.dutchieUrl,
|
||||
JSON.stringify(item.metadata || {}), productId, dispensaryId, slug
|
||||
JSON.stringify(item.metadata || {}), productId, dispensaryId, slug,
|
||||
nameNormalized, brandNormalized, externalId
|
||||
]);
|
||||
|
||||
logger.debug('pipeline', `Updated product: ${item.name}`);
|
||||
} else {
|
||||
// Insert new product
|
||||
isNewProduct = true;
|
||||
const insertResult = await client.query(`
|
||||
INSERT INTO products (
|
||||
store_id, category_id, dispensary_id, dutchie_product_id, slug, name, description,
|
||||
store_id, category_id, dispensary_id, dutchie_product_id, external_id,
|
||||
slug, name, name_normalized, description,
|
||||
price, strain_type, thc_percentage, cbd_percentage,
|
||||
brand, weight, image_url, dutchie_url, in_stock, metadata
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16)
|
||||
brand, brand_normalized, weight, image_url, dutchie_url, in_stock, metadata,
|
||||
source_platform
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, true, $19, 'dutchie')
|
||||
RETURNING id
|
||||
`, [
|
||||
storeId, categoryId, dispensaryId, item.dutchieProductId, slug, item.name, item.description,
|
||||
storeId, categoryId, dispensaryId, externalId, externalId,
|
||||
slug, item.name, nameNormalized, item.description,
|
||||
item.price, item.strainType, item.thcPercentage, item.cbdPercentage,
|
||||
item.brand, item.weight, item.imageUrl, item.dutchieUrl,
|
||||
item.brand, brandNormalized, item.weight, item.imageUrl, item.dutchieUrl,
|
||||
JSON.stringify(item.metadata || {})
|
||||
]);
|
||||
|
||||
productId = insertResult.rows[0].id;
|
||||
logger.debug('pipeline', `Inserted new product: ${item.name}`);
|
||||
logger.debug('pipeline', `Inserted NEW product: ${item.name}`);
|
||||
}
|
||||
|
||||
// Download image if needed
|
||||
if (item.imageUrl && !localImagePath) {
|
||||
// ALWAYS create a snapshot for historical tracking
|
||||
await this.createSnapshot(client, {
|
||||
productId: productId!,
|
||||
dispensaryId,
|
||||
externalId,
|
||||
slug,
|
||||
item,
|
||||
categoryName
|
||||
});
|
||||
|
||||
// Download image if needed (only for new products or missing local image)
|
||||
if (item.imageUrl && !localImagePath && productId) {
|
||||
try {
|
||||
// Get store slug for organized image storage
|
||||
const storeResult = await client.query(
|
||||
'SELECT slug FROM stores WHERE id = $1',
|
||||
[storeId]
|
||||
);
|
||||
const storeSlug = storeResult.rows[0]?.slug || undefined;
|
||||
|
||||
const imageSizes = await uploadImageFromUrl(item.imageUrl, productId, storeSlug);
|
||||
// Use thumbnail path for local_image_path
|
||||
const imageSizes = await uploadImageFromUrl(item.imageUrl, productId!, storeSlug);
|
||||
localImagePath = imageSizes.thumbnail;
|
||||
await client.query(`
|
||||
UPDATE products SET local_image_path = $1 WHERE id = $2
|
||||
@@ -262,6 +353,10 @@ export class DatabasePipeline implements ItemPipeline<Product> {
|
||||
}
|
||||
}
|
||||
|
||||
// Attach metadata for stats tracking
|
||||
(item as any).isNewProduct = isNewProduct;
|
||||
(item as any).productId = productId;
|
||||
|
||||
return item;
|
||||
|
||||
} catch (error) {
|
||||
@@ -271,6 +366,78 @@ export class DatabasePipeline implements ItemPipeline<Product> {
|
||||
client.release();
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Create a snapshot record for historical tracking
 *
 * Inserts one row into product_snapshots describing the item as it was just
 * scraped. Does nothing when the product_snapshots table does not exist.
 * Any error is logged as a warning and swallowed, so a snapshot failure
 * never fails the surrounding pipeline step.
 *
 * @param client  Database client used for both queries.
 * @param params  Identifiers and the scraped item to record. `productId` is
 *                accepted for interface compatibility but is not written to
 *                the snapshot row.
 */
private async createSnapshot(
  client: any,
  params: {
    productId: number;
    dispensaryId: number | null;
    externalId: string | null;
    slug: string;
    item: Product;
    categoryName?: string;
  }
): Promise<void> {
  try {
    // Only create snapshots if the table exists (graceful degradation).
    // NOTE(review): this existence check runs once per item and does not
    // filter by schema — consider caching the result on the instance.
    const tableExists = await client.query(`
      SELECT EXISTS (
        SELECT FROM information_schema.tables
        WHERE table_name = 'product_snapshots'
      )
    `);

    if (!tableExists.rows[0].exists) {
      return; // Snapshot table not yet created
    }

    // NOTE(review): `crypto` is not imported in the visible part of this
    // file, so this presumably relies on the runtime's global — confirm.
    const crawlId = this.crawlId || crypto.randomUUID();
    const { dispensaryId, externalId, slug, item, categoryName } = params;

    // For numeric columns a literal 0 is real data (free item, 0% THC,
    // zero stock) and must not be collapsed to NULL the way `value || null`
    // does. Every other falsy value (undefined, null, NaN, '') still
    // becomes NULL.
    const keepZero = (value: any) => (value === 0 ? 0 : value || null);

    await client.query(`
      INSERT INTO product_snapshots (
        crawl_id, dispensary_id, external_product_id, product_slug,
        name, brand, category, price, original_price, sale_price,
        discount_type, discount_value, availability_status, stock_quantity,
        thc_percentage, cbd_percentage, strain_type, weight, variant,
        description, image_url, effects, terpenes, captured_at
      ) VALUES (
        $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, NOW()
      )
    `, [
      crawlId,
      dispensaryId,
      externalId,
      slug,
      item.name,
      item.brand || null,
      categoryName || null,
      keepZero(item.price),
      keepZero(item.originalPrice),
      keepZero(item.metadata?.salePrice),
      item.metadata?.discountType || null,
      keepZero(item.metadata?.discountValue),
      'in_stock', // availability_status - if we scraped it, it's in stock
      keepZero(item.metadata?.stockQuantity),
      keepZero(item.thcPercentage),
      keepZero(item.cbdPercentage),
      item.strainType || null,
      item.weight || null,
      item.metadata?.variant || null,
      item.description || null,
      item.imageUrl || null,
      item.metadata?.effects || null,
      item.metadata?.terpenes || null
    ]);

  } catch (error) {
    // Don't fail the whole pipeline if snapshot creation fails
    logger.warn('pipeline', `Failed to create snapshot for ${params.item.name}: ${error}`);
  }
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
439
backend/src/scrapers/dutchie-graphql-direct.ts
Normal file
439
backend/src/scrapers/dutchie-graphql-direct.ts
Normal file
@@ -0,0 +1,439 @@
|
||||
// ============================================================================
|
||||
// DEPRECATED: This scraper writes to the LEGACY products table.
|
||||
// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline.
|
||||
//
|
||||
// New pipeline location: src/dutchie-az/services/product-crawler.ts
|
||||
// - Uses fetch-based GraphQL (no Puppeteer needed)
|
||||
// - Writes to isolated dutchie_az_* tables with snapshot model
|
||||
// - Tracks stockStatus, isPresentInFeed, missing_from_feed
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
|
||||
* This scraper writes to the legacy products table, not the new dutchie_az tables.
|
||||
*
|
||||
* Makes direct GraphQL requests from within the browser context to:
|
||||
* 1. Bypass Cloudflare (using browser session)
|
||||
* 2. Fetch ALL products including out-of-stock (Status: null)
|
||||
* 3. Paginate through complete menu
|
||||
*/
|
||||
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import type { Browser, Page } from 'puppeteer';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import { Pool } from 'pg';
|
||||
import { DutchieProduct, NormalizedProduct, normalizeDutchieProduct } from './dutchie-graphql';
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
/**
 * Persisted-query SHA-256 hashes, keyed by GraphQL operation name.
 * The `FilteredProducts` hash is sent with every product page request as
 * `extensions.persistedQuery.sha256Hash`.
 */
const GRAPHQL_HASHES = {
  FilteredProducts:
    'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
  GetAddressBasedDispensaryData:
    '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
};
|
||||
|
||||
/** Result of fetching a dispensary menu through in-page GraphQL requests. */
interface FetchResult {
  /** Every product collected across the fetched pages. */
  products: Array<DutchieProduct>;
  /** Dispensary ID read from the loaded menu page. */
  dispensaryId: string;
  /** Number of entries in `products`. */
  totalProducts: number;
  /** Products whose `Status` is exactly `'Active'`. */
  activeCount: number;
  /** Products whose `Status` is anything other than `'Active'`. */
  inactiveCount: number;
}
|
||||
|
||||
/**
 * Fetch a dispensary menu by issuing `FilteredProducts` GraphQL requests from
 * inside a Puppeteer page.
 *
 * The menu page is loaded first so that the in-page `fetch` calls are sent
 * with that page's cookies (`credentials: 'include'`). Pages are requested
 * starting at page 0 until one of the following happens:
 *   - a page comes back with no products,
 *   - the response reports GraphQL errors (products gathered so far are kept),
 *   - the page index passes the safety limit.
 *
 * @param menuUrl - Dutchie menu URL to load before querying.
 * @param options - Optional settings:
 *   - `headless`: Puppeteer headless mode (default `'new'`).
 *   - `timeout`: navigation timeout in ms for the initial load (default 90000).
 *   - `perPage`: products requested per GraphQL page (default 100).
 *   - `includeOutOfStock`: when true (default) the `Status` filter is sent as
 *     `null`; otherwise it is sent as `'Active'`.
 * @returns The collected products, the dispensary ID read from the page, and
 *   total / active / inactive counts.
 * @throws Error when no dispensary ID can be read from `window.reactEnv`.
 *   Launch, navigation and in-page request failures also propagate.
 */
export async function fetchAllDutchieProducts(
  menuUrl: string,
  options: {
    headless?: boolean | 'new';
    timeout?: number;
    perPage?: number;
    includeOutOfStock?: boolean;
  } = {}
): Promise<FetchResult> {
  const {
    headless = 'new',
    timeout = 90000,
    perPage = 100,
    includeOutOfStock = true,
  } = options;

  // Highest page index that may be followed by another request.
  const MAX_PAGE_INDEX = 50;

  let browser: Browser | undefined;

  try {
    browser = await puppeteer.launch({
      headless,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
      ],
    });

    const page = await browser.newPage();

    // Stealth configuration
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );
    await page.setViewport({ width: 1920, height: 1080 });
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
      (window as any).chrome = { runtime: {} };
    });

    // Load the menu page so later in-page requests carry its session cookies.
    console.log('[DutchieGraphQL] Loading menu page to establish session...');
    await page.goto(menuUrl, {
      waitUntil: 'networkidle2',
      timeout,
    });

    // The page exposes its dispensary/retailer ID on window.reactEnv.
    const dispensaryId = await page.evaluate(() => {
      const env = (window as any).reactEnv;
      return env?.dispensaryId || env?.retailerId || '';
    });

    if (!dispensaryId) {
      throw new Error('Could not determine dispensaryId from page');
    }

    console.log(`[DutchieGraphQL] Dispensary ID: ${dispensaryId}`);

    const collected: DutchieProduct[] = [];
    let pageIndex = 0;

    while (true) {
      console.log(`[DutchieGraphQL] Fetching page ${pageIndex} (perPage=${perPage})...`);

      // This callback is serialized and executed in the browser, so it must
      // only use its own arguments and browser globals.
      const result = await page.evaluate(
        async (
          dispensaryId: string,
          page_num: number,
          perPage: number,
          includeOutOfStock: boolean,
          hash: string
        ) => {
          const variables = {
            includeEnterpriseSpecials: false,
            productsFilter: {
              dispensaryId,
              pricingType: 'rec',
              Status: includeOutOfStock ? null : 'Active', // null = include out-of-stock
              types: [],
              useCache: false, // Don't cache to get fresh data
              isDefaultSort: true,
              sortBy: 'popularSortIdx',
              sortDirection: 1,
              bypassOnlineThresholds: true,
              isKioskMenu: false,
              removeProductsBelowOptionThresholds: false,
            },
            page: page_num,
            perPage,
          };

          const qs = new URLSearchParams({
            operationName: 'FilteredProducts',
            variables: JSON.stringify(variables),
            extensions: JSON.stringify({
              persistedQuery: { version: 1, sha256Hash: hash },
            }),
          });

          const response = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
            method: 'GET',
            headers: {
              'content-type': 'application/json',
              'apollographql-client-name': 'Marketplace (production)',
            },
            credentials: 'include', // Include cookies/session
          });

          if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
          }

          return response.json();
        },
        dispensaryId,
        pageIndex,
        perPage,
        includeOutOfStock,
        GRAPHQL_HASHES.FilteredProducts
      );

      // Guarded access: a null/empty JSON body is treated like an empty page
      // below instead of raising a TypeError here.
      if (result?.errors) {
        console.error('[DutchieGraphQL] GraphQL errors:', result.errors);
        break;
      }

      const products = result?.data?.filteredProducts?.products || [];
      console.log(`[DutchieGraphQL] Page ${pageIndex}: ${products.length} products`);

      if (products.length === 0) {
        break;
      }

      collected.push(...products);
      pageIndex++;

      // Safety limit
      if (pageIndex > MAX_PAGE_INDEX) {
        console.log('[DutchieGraphQL] Reached page limit, stopping');
        break;
      }
    }

    // Anything whose Status is not exactly 'Active' counts as inactive.
    const activeCount = collected.filter((p) => p.Status === 'Active').length;
    const inactiveCount = collected.length - activeCount;

    console.log(`[DutchieGraphQL] Total: ${collected.length} products (${activeCount} active, ${inactiveCount} inactive)`);

    return {
      products: collected,
      dispensaryId,
      totalProducts: collected.length,
      activeCount,
      inactiveCount,
    };
  } finally {
    // Always shut the browser down, including on errors.
    if (browser) {
      await browser.close();
    }
  }
}
|
||||
|
||||
/**
 * Upsert normalized products into the legacy `products` table inside a single
 * transaction.
 *
 * Rows are matched on `(store_id, slug)`. An existing row has its data columns
 * overwritten and `last_seen_at` / `updated_at` set to `NOW()`; a new row is
 * inserted with an empty `dutchie_url`.
 *
 * @param pool - pg connection pool; one client is checked out for the batch.
 * @param storeId - Value written to `store_id` for every product.
 * @param products - Normalized products to write.
 * @returns Counts of inserted and updated rows, derived from the statement's
 *   `RETURNING (xmax = 0)` flag.
 * @throws Rethrows the first query error after attempting a ROLLBACK. A failed
 *   ROLLBACK is logged and never replaces the original error.
 */
export async function upsertProductsDirect(
  pool: Pool,
  storeId: number,
  products: NormalizedProduct[]
): Promise<{ inserted: number; updated: number }> {
  // Statement text is identical for every product, so build it once.
  const upsertSql = `
        INSERT INTO products (
          store_id, external_id, slug, name, enterprise_product_id,
          brand, brand_external_id, brand_logo_url,
          subcategory, strain_type, canonical_category,
          price, rec_price, med_price, rec_special_price, med_special_price,
          is_on_special, special_name, discount_percent, special_data,
          sku, inventory_quantity, inventory_available, is_below_threshold, status,
          thc_percentage, cbd_percentage, cannabinoids,
          weight_mg, net_weight_value, net_weight_unit, options, raw_options,
          image_url, additional_images,
          is_featured, medical_only, rec_only,
          source_created_at, source_updated_at,
          description, raw_data,
          dutchie_url, last_seen_at, updated_at
        )
        VALUES (
          $1, $2, $3, $4, $5,
          $6, $7, $8,
          $9, $10, $11,
          $12, $13, $14, $15, $16,
          $17, $18, $19, $20,
          $21, $22, $23, $24, $25,
          $26, $27, $28,
          $29, $30, $31, $32, $33,
          $34, $35,
          $36, $37, $38,
          $39, $40,
          $41, $42,
          '', NOW(), NOW()
        )
        ON CONFLICT (store_id, slug) DO UPDATE SET
          name = EXCLUDED.name,
          enterprise_product_id = EXCLUDED.enterprise_product_id,
          brand = EXCLUDED.brand,
          brand_external_id = EXCLUDED.brand_external_id,
          brand_logo_url = EXCLUDED.brand_logo_url,
          subcategory = EXCLUDED.subcategory,
          strain_type = EXCLUDED.strain_type,
          canonical_category = EXCLUDED.canonical_category,
          price = EXCLUDED.price,
          rec_price = EXCLUDED.rec_price,
          med_price = EXCLUDED.med_price,
          rec_special_price = EXCLUDED.rec_special_price,
          med_special_price = EXCLUDED.med_special_price,
          is_on_special = EXCLUDED.is_on_special,
          special_name = EXCLUDED.special_name,
          discount_percent = EXCLUDED.discount_percent,
          special_data = EXCLUDED.special_data,
          sku = EXCLUDED.sku,
          inventory_quantity = EXCLUDED.inventory_quantity,
          inventory_available = EXCLUDED.inventory_available,
          is_below_threshold = EXCLUDED.is_below_threshold,
          status = EXCLUDED.status,
          thc_percentage = EXCLUDED.thc_percentage,
          cbd_percentage = EXCLUDED.cbd_percentage,
          cannabinoids = EXCLUDED.cannabinoids,
          weight_mg = EXCLUDED.weight_mg,
          net_weight_value = EXCLUDED.net_weight_value,
          net_weight_unit = EXCLUDED.net_weight_unit,
          options = EXCLUDED.options,
          raw_options = EXCLUDED.raw_options,
          image_url = EXCLUDED.image_url,
          additional_images = EXCLUDED.additional_images,
          is_featured = EXCLUDED.is_featured,
          medical_only = EXCLUDED.medical_only,
          rec_only = EXCLUDED.rec_only,
          source_created_at = EXCLUDED.source_created_at,
          source_updated_at = EXCLUDED.source_updated_at,
          description = EXCLUDED.description,
          raw_data = EXCLUDED.raw_data,
          last_seen_at = NOW(),
          updated_at = NOW()
        RETURNING (xmax = 0) AS was_inserted
        `;

  const client = await pool.connect();
  let inserted = 0;
  let updated = 0;
  // Set only when ROLLBACK itself fails; the client is then discarded.
  let rollbackFailure: Error | undefined;

  try {
    await client.query('BEGIN');

    for (const product of products) {
      // Parameter order must match the column list in upsertSql ($1..$42).
      const result = await client.query(upsertSql, [
        storeId,
        product.external_id,
        product.slug,
        product.name,
        product.enterprise_product_id,
        product.brand,
        product.brand_external_id,
        product.brand_logo_url,
        product.subcategory,
        product.strain_type,
        product.canonical_category,
        product.price,
        product.rec_price,
        product.med_price,
        product.rec_special_price,
        product.med_special_price,
        product.is_on_special,
        product.special_name,
        product.discount_percent,
        product.special_data ? JSON.stringify(product.special_data) : null,
        product.sku,
        product.inventory_quantity,
        product.inventory_available,
        product.is_below_threshold,
        product.status,
        product.thc_percentage,
        product.cbd_percentage,
        product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
        product.weight_mg,
        product.net_weight_value,
        product.net_weight_unit,
        product.options,
        product.raw_options,
        product.image_url,
        product.additional_images,
        product.is_featured,
        product.medical_only,
        product.rec_only,
        product.source_created_at,
        product.source_updated_at,
        product.description,
        product.raw_data ? JSON.stringify(product.raw_data) : null,
      ]);

      // xmax = 0 marks a freshly inserted row; anything else went through the
      // ON CONFLICT update path.
      if (result.rows[0]?.was_inserted) {
        inserted++;
      } else {
        updated++;
      }
    }

    await client.query('COMMIT');
    return { inserted, updated };
  } catch (error) {
    try {
      await client.query('ROLLBACK');
    } catch (rollbackError) {
      // Keep the original error as the one callers see.
      rollbackFailure =
        rollbackError instanceof Error ? rollbackError : new Error(String(rollbackError));
      console.error('[DutchieGraphQL] ROLLBACK failed:', rollbackFailure.message);
    }
    throw error;
  } finally {
    // Passing an Error makes the pool destroy the client instead of reusing a
    // connection that may still hold an aborted transaction.
    client.release(rollbackFailure);
  }
}
|
||||
|
||||
/**
 * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
 *
 * Disabled entry point: every call rejects with a deprecation error and no
 * scraping or database work is performed.
 *
 * Before it was disabled, this function fetched the menu with
 * `fetchAllDutchieProducts` (out-of-stock included, 100 per page), mapped the
 * results through `normalizeDutchieProduct`, wrote them with
 * `upsertProductsDirect`, and returned the counts. That body sat after the
 * `throw` and could never execute, so it has been removed.
 *
 * @param pool - Unused; retained so existing call sites still compile.
 * @param storeId - Unused; retained so existing call sites still compile.
 * @param menuUrl - Unused; retained so existing call sites still compile.
 * @throws Error always.
 */
export async function scrapeAllDutchieProducts(
  pool: Pool,
  storeId: number,
  menuUrl: string
): Promise<{
  success: boolean;
  totalProducts: number;
  activeCount: number;
  inactiveCount: number;
  inserted: number;
  updated: number;
  error?: string;
}> {
  // DEPRECATED: Throw error to prevent accidental use
  throw new Error(
    'DEPRECATED: scrapeAllDutchieProducts() is deprecated. ' +
      'Use src/dutchie-az/services/product-crawler.ts instead. ' +
      'This scraper writes to the legacy products table.'
  );
}
|
||||
711
backend/src/scrapers/dutchie-graphql.ts
Normal file
711
backend/src/scrapers/dutchie-graphql.ts
Normal file
@@ -0,0 +1,711 @@
|
||||
// ============================================================================
|
||||
// DEPRECATED: This scraper writes to the LEGACY products table.
|
||||
// DO NOT USE - All Dutchie crawling must use the new dutchie-az pipeline.
|
||||
//
|
||||
// New pipeline location: src/dutchie-az/services/product-crawler.ts
|
||||
// - Uses fetch-based GraphQL (no Puppeteer needed)
|
||||
// - Writes to isolated dutchie_az_* tables with snapshot model
|
||||
// - Tracks stockStatus, isPresentInFeed, missing_from_feed
|
||||
//
|
||||
// The normalizer functions in this file (normalizeDutchieProduct) may still
|
||||
// be imported for reference, but do NOT call scrapeDutchieMenu() or upsertProducts().
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
|
||||
* This scraper writes to the legacy products table, not the new dutchie_az tables.
|
||||
*
|
||||
* Fetches product data via Puppeteer interception of Dutchie's GraphQL API.
|
||||
* This bypasses Cloudflare by using a real browser to load the menu page.
|
||||
*
|
||||
* GraphQL Operations:
|
||||
* - FilteredProducts: Returns paginated product list with full details
|
||||
* - GetAddressBasedDispensaryData: Resolves dispensary cName to dispensaryId
|
||||
*/
|
||||
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import type { Browser, Page, HTTPResponse } from 'puppeteer';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import { Pool } from 'pg';
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
// =====================================================
|
||||
// TYPE DEFINITIONS (from captured GraphQL schema)
|
||||
// =====================================================
|
||||
|
||||
/**
 * Shape of one product as returned by Dutchie's `FilteredProducts` GraphQL
 * operation. Field names keep the API's original casing.
 */
export interface DutchieProduct {
  // --- Identity ---
  _id: string;
  id: string;
  Name: string;
  /** URL slug. */
  cName: string;
  enterpriseProductId?: string;
  DispensaryID: string;

  // --- Brand (nested object and flattened variants) ---
  brand?: {
    id: string;
    name: string;
    imageUrl?: string;
    description?: string;
  };
  brandId?: string;
  brandName?: string;
  brandLogo?: string;

  // --- Category ---
  /** e.g. "Edible", "Flower". */
  type?: string;
  /** e.g. "gummies", "pre-rolls". */
  subcategory?: string;
  /** "Indica", "Sativa", "Hybrid" or "N/A". */
  strainType?: string;

  // --- Pricing: arrays whose first element is the primary price ---
  Prices?: Array<number>;
  recPrices?: Array<number>;
  medicalPrices?: Array<number>;
  recSpecialPrices?: Array<number>;
  medicalSpecialPrices?: Array<number>;

  // --- Specials ---
  special?: boolean;
  specialData?: {
    saleSpecials?: {
      specialId: string;
      specialName: string;
      discount: number;
      percentDiscount: boolean;
      dollarDiscount: boolean;
      specialType: string;
    }[];
    bogoSpecials?: any;
  };

  // --- Inventory ---
  POSMetaData?: {
    canonicalSKU?: string;
    canonicalCategory?: string;
    canonicalName?: string;
    canonicalLabResultUrl?: string;
    children?: {
      option: string;
      price: number;
      quantity: number;
      quantityAvailable: number;
      recPrice?: number;
      medPrice?: number;
    }[];
  };
  /** "Active" or "Inactive". */
  Status?: string;
  isBelowThreshold?: boolean;

  // --- Potency ---
  THCContent?: {
    unit: string;
    range: Array<number>;
  };
  CBDContent?: {
    unit: string;
    range: Array<number>;
  };
  cannabinoidsV2?: {
    value: number;
    unit: string;
    cannabinoid: {
      name: string;
    };
  }[];

  // --- Weight / options ---
  Options?: Array<string>;
  rawOptions?: Array<string>;
  weight?: number;
  measurements?: {
    netWeight?: {
      unit: string;
      values: Array<number>;
    };
    volume?: any;
  };

  // --- Images ---
  Image?: string;
  images?: Array<string>;

  // --- Flags ---
  featured?: boolean;
  medicalOnly?: boolean;
  recOnly?: boolean;

  // --- Timestamps (delivered as strings) ---
  createdAt?: string;
  updatedAt?: string;

  // --- Description ---
  description?: string;
  effects?: { [key: string]: any };
  terpenes?: Array<any>;
}
|
||||
|
||||
/**
 * Flattened product record using the snake_case column names of the legacy
 * `products` table. Produced by `normalizeDutchieProduct`.
 */
export interface NormalizedProduct {
  // --- Identity ---
  external_id: string;
  slug: string;
  name: string;
  enterprise_product_id?: string;

  // --- Brand ---
  brand?: string;
  brand_external_id?: string;
  brand_logo_url?: string;

  // --- Category ---
  subcategory?: string;
  strain_type?: string;
  canonical_category?: string;

  // --- Pricing ---
  price?: number;
  rec_price?: number;
  med_price?: number;
  rec_special_price?: number;
  med_special_price?: number;

  // --- Specials ---
  is_on_special: boolean;
  special_name?: string;
  discount_percent?: number;
  /** Untyped special payload. */
  special_data?: any;

  // --- Inventory ---
  sku?: string;
  inventory_quantity?: number;
  inventory_available?: number;
  is_below_threshold: boolean;
  status?: string;

  // --- Potency ---
  thc_percentage?: number;
  cbd_percentage?: number;
  /** Untyped cannabinoid payload. */
  cannabinoids?: any;

  // --- Weight / options ---
  weight_mg?: number;
  net_weight_value?: number;
  net_weight_unit?: string;
  options?: Array<string>;
  raw_options?: Array<string>;

  // --- Images ---
  image_url?: string;
  additional_images?: Array<string>;

  // --- Flags ---
  is_featured: boolean;
  medical_only: boolean;
  rec_only: boolean;

  // --- Timestamps reported by the source ---
  source_created_at?: Date;
  source_updated_at?: Date;

  // --- Description and raw payload ---
  description?: string;
  raw_data?: any;
}
|
||||
|
||||
// =====================================================
|
||||
// NORMALIZER: Dutchie GraphQL → DB Schema
|
||||
// =====================================================
|
||||
|
||||
/**
 * Parse a timestamp string from the product feed.
 *
 * Digit-only values (optionally with a fraction) are read as epoch
 * milliseconds, e.g. "1729044510543". Anything else is handed to the `Date`
 * constructor. Missing or unparseable input yields `undefined`, never an
 * Invalid Date.
 */
function parseSourceTimestamp(raw: string | undefined): Date | undefined {
  if (raw === undefined || raw === null || raw === '') {
    return undefined;
  }

  const text = String(raw).trim();
  const parsed = /^\d+(\.\d+)?$/.test(text) ? new Date(Number(text)) : new Date(text);

  return isNaN(parsed.getTime()) ? undefined : parsed;
}

/**
 * Convert a raw Dutchie GraphQL product into the flat `NormalizedProduct`
 * shape.
 *
 * Notes on the mapping:
 *   - Price fields take the first element of the corresponding price array.
 *   - Special name and percent discount come from the first `saleSpecials`
 *     entry; `discount_percent` is set only when that entry is flagged
 *     `percentDiscount`.
 *   - Inventory totals are summed over `POSMetaData.children`; a sum of 0 is
 *     reported as `undefined`.
 *   - Boolean flags are true only when the source value is exactly `true`.
 *   - `createdAt` / `updatedAt` accept epoch-millisecond strings and date
 *     strings; unparseable values become `undefined`.
 *
 * @param product - Raw product from the GraphQL response.
 * @returns The normalized record, with the untouched input in `raw_data`.
 */
export function normalizeDutchieProduct(product: DutchieProduct): NormalizedProduct {
  // Only the first sale special contributes name and discount.
  const firstSaleSpecial = product.specialData?.saleSpecials?.[0];

  // Sum stock across all POS child options.
  const posChildren = product.POSMetaData?.children || [];
  let totalQuantity = 0;
  let availableQuantity = 0;
  for (const child of posChildren) {
    totalQuantity += child.quantity || 0;
    availableQuantity += child.quantityAvailable || 0;
  }

  return {
    // Identity
    external_id: product._id || product.id,
    slug: product.cName,
    name: product.Name,
    enterprise_product_id: product.enterpriseProductId,

    // Brand: flattened fields win over the nested brand object
    brand: product.brandName || product.brand?.name,
    brand_external_id: product.brandId || product.brand?.id,
    brand_logo_url: product.brandLogo || product.brand?.imageUrl,

    // Category
    subcategory: product.subcategory,
    strain_type: product.strainType,
    canonical_category: product.POSMetaData?.canonicalCategory,

    // Pricing
    price: product.Prices?.[0],
    rec_price: product.recPrices?.[0],
    med_price: product.medicalPrices?.[0],
    rec_special_price: product.recSpecialPrices?.[0],
    med_special_price: product.medicalSpecialPrices?.[0],

    // Specials
    is_on_special: product.special === true,
    special_name: firstSaleSpecial?.specialName,
    discount_percent: firstSaleSpecial?.percentDiscount ? firstSaleSpecial.discount : undefined,
    special_data: product.specialData,

    // Inventory (a zero sum is stored as undefined)
    sku: product.POSMetaData?.canonicalSKU,
    inventory_quantity: totalQuantity || undefined,
    inventory_available: availableQuantity || undefined,
    is_below_threshold: product.isBelowThreshold === true,
    status: product.Status,

    // Potency: first value of each range
    thc_percentage: product.THCContent?.range?.[0],
    cbd_percentage: product.CBDContent?.range?.[0],
    cannabinoids: product.cannabinoidsV2,

    // Weight/Options
    weight_mg: product.weight,
    net_weight_value: product.measurements?.netWeight?.values?.[0],
    net_weight_unit: product.measurements?.netWeight?.unit,
    options: product.Options,
    raw_options: product.rawOptions,

    // Images (an empty images array is stored as undefined)
    image_url: product.Image,
    additional_images: product.images?.length ? product.images : undefined,

    // Flags
    is_featured: product.featured === true,
    medical_only: product.medicalOnly === true,
    rec_only: product.recOnly === true,

    // Timestamps
    source_created_at: parseSourceTimestamp(product.createdAt),
    source_updated_at: parseSourceTimestamp(product.updatedAt),

    // Description: non-string values are dropped
    description: typeof product.description === 'string' ? product.description : undefined,

    // Raw
    raw_data: product,
  };
}
|
||||
|
||||
// =====================================================
|
||||
// PUPPETEER SCRAPER
|
||||
// =====================================================
|
||||
|
||||
/** Products gathered by intercepting GraphQL responses while browsing a menu. */
interface CapturedProducts {
  /** De-duplicated products seen in intercepted `filteredProducts` responses. */
  products: Array<DutchieProduct>;
  /** Dispensary ID from an intercepted response or from the page; may be an empty string. */
  dispensaryId: string;
  /** The menu URL that was scraped. */
  menuUrl: string;
}
|
||||
|
||||
/**
 * Load a Dutchie menu in Puppeteer and collect products by intercepting the
 * page's own GraphQL responses.
 *
 * Flow:
 *   1. Register a response listener that records products from
 *      `filteredProducts` payloads and the dispensary ID from
 *      `getAddressBasedDispensaryData` payloads.
 *   2. Load the menu page and scroll to the bottom repeatedly until three
 *      consecutive scrolls add no new products or `maxScrolls` is reached.
 *   3. Visit every link whose href contains "/products/" and scroll each one
 *      (at most 15 scrolls). A failure on one category is logged and skipped.
 *
 * Products are de-duplicated on `_id`, falling back to `id`, which is the same
 * identity rule `normalizeDutchieProduct` uses for `external_id`.
 *
 * @param menuUrl - Dutchie menu URL to open.
 * @param options - Optional settings:
 *   - `headless`: Puppeteer headless mode (default `'new'`).
 *   - `timeout`: navigation timeout in ms for the main page (default 90000).
 *   - `maxScrolls`: scroll limit for the main page (default 30).
 * @returns Captured products, the dispensary ID (empty string if none was
 *   found) and the menu URL.
 * @throws Propagates browser launch and main-page navigation errors.
 */
export async function fetchDutchieMenuViaPuppeteer(
  menuUrl: string,
  options: {
    headless?: boolean | 'new';
    timeout?: number;
    maxScrolls?: number;
  } = {}
): Promise<CapturedProducts> {
  const {
    headless = 'new',
    timeout = 90000,
    maxScrolls = 30, // Increased for full menu capture
  } = options;

  let browser: Browser | undefined;
  const capturedProducts: DutchieProduct[] = [];
  let dispensaryId = '';

  try {
    browser = await puppeteer.launch({
      headless,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
      ],
    });

    const page = await browser.newPage();

    // Stealth configuration
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );
    await page.setViewport({ width: 1920, height: 1080 });
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
      (window as any).chrome = { runtime: {} };
    });

    // Product identities already captured; its size doubles as the progress
    // counter for the scroll loop.
    const seenIds = new Set<string>();

    // Intercept GraphQL responses
    page.on('response', async (response: HTTPResponse) => {
      if (!response.url().includes('graphql')) return;

      try {
        const contentType = response.headers()['content-type'] || '';
        if (!contentType.includes('application/json')) return;

        const payload = await response.json();

        // Capture dispensary ID
        const resolvedId =
          payload?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId;
        if (resolvedId) {
          dispensaryId = resolvedId;
        }

        // Capture products from FilteredProducts
        const batch = payload?.data?.filteredProducts?.products as DutchieProduct[] | undefined;
        if (batch) {
          for (const product of batch) {
            // Fall back to `id` so products lacking `_id` are not all
            // collapsed into a single entry.
            const identity = product._id || product.id;
            if (!seenIds.has(identity)) {
              seenIds.add(identity);
              capturedProducts.push(product);
            }
          }
        }
      } catch {
        // Best effort: bodies that cannot be read or parsed are skipped.
      }
    });

    // Navigate to menu
    console.log('[DutchieGraphQL] Loading menu page...');
    await page.goto(menuUrl, {
      waitUntil: 'networkidle2',
      timeout,
    });

    // Get dispensary ID from window.reactEnv if not captured
    if (!dispensaryId) {
      dispensaryId = await page.evaluate(() => {
        const env = (window as any).reactEnv;
        return env?.dispensaryId || env?.retailerId || '';
      });
    }

    // Scroll to the bottom until the product count stops growing for three
    // consecutive rounds or the attempt budget is used up.
    const scrollToLoadAll = async (maxScrollAttempts: number = maxScrolls): Promise<void> => {
      let lastSeen = 0;
      let idleRounds = 0;

      for (let attempt = 0; attempt < maxScrollAttempts && idleRounds < 3; attempt++) {
        await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
        // Give lazy-loaded requests time to complete before re-checking.
        await new Promise((r) => setTimeout(r, 1500));

        const seenNow = seenIds.size;
        idleRounds = seenNow === lastSeen ? idleRounds + 1 : 0;
        lastSeen = seenNow;
      }
    };

    // First, scroll through the main page (all products)
    console.log('[DutchieGraphQL] Scrolling main page...');
    await scrollToLoadAll();
    console.log(`[DutchieGraphQL] After main page: ${seenIds.size} products`);

    // Collect distinct "/products/" links, in document order.
    const categoryLinks = await page.evaluate(() => {
      const unique = new Set<string>();
      document.querySelectorAll('a[href*="/products/"]').forEach((link) => {
        const href = (link as HTMLAnchorElement).href;
        if (href) {
          unique.add(href);
        }
      });
      return Array.from(unique);
    });

    console.log(`[DutchieGraphQL] Found ${categoryLinks.length} category links`);

    // Visit each category page to capture all products
    for (const categoryUrl of categoryLinks) {
      try {
        console.log(`[DutchieGraphQL] Visiting category: ${categoryUrl.split('/').pop()}`);
        await page.goto(categoryUrl, {
          waitUntil: 'networkidle2',
          timeout: 30000,
        });
        await scrollToLoadAll(15); // Fewer scrolls per category
        console.log(`[DutchieGraphQL] Total products: ${seenIds.size}`);
      } catch (e: any) {
        // One failing category must not abort the whole capture.
        console.log(`[DutchieGraphQL] Category error: ${e.message}`);
      }
    }

    // Wait for any final responses
    await new Promise((r) => setTimeout(r, 2000));

    return {
      products: capturedProducts,
      dispensaryId,
      menuUrl,
    };
  } finally {
    // Always shut the browser down, including on errors.
    if (browser) {
      await browser.close();
    }
  }
}
|
||||
|
||||
// =====================================================
|
||||
// DATABASE OPERATIONS
|
||||
// =====================================================
|
||||
|
||||
/**
 * Upsert a batch of normalized products into the `products` table for one store.
 *
 * Every row is written inside a single transaction, so either the whole batch
 * is persisted or none of it is. Rows are matched on `(store_id, slug)`; on
 * conflict the existing row is refreshed and `last_seen_at` / `updated_at` are
 * set to NOW(). `store_id`, `external_id`, `slug` and `dutchie_url` are not
 * modified by the update branch.
 *
 * @param pool     Pool from which a dedicated client is checked out.
 * @param storeId  Value written to `products.store_id` for every row.
 * @param products Normalized products to persist; an empty list is a no-op.
 * @returns How many rows were inserted and how many were updated.
 * @throws Re-throws the first query error after attempting a ROLLBACK.
 */
export async function upsertProducts(
  pool: Pool,
  storeId: number,
  products: NormalizedProduct[]
): Promise<{ inserted: number; updated: number }> {
  // Nothing to write: skip the connection checkout and the empty transaction.
  if (products.length === 0) {
    return { inserted: 0, updated: 0 };
  }

  // The statement text is the same for every product, so build it once.
  // `was_inserted` relies on the xmax system column being 0 for a row that
  // was freshly inserted rather than updated by ON CONFLICT.
  const upsertSql = `
    INSERT INTO products (
      store_id, external_id, slug, name, enterprise_product_id,
      brand, brand_external_id, brand_logo_url,
      subcategory, strain_type, canonical_category,
      price, rec_price, med_price, rec_special_price, med_special_price,
      is_on_special, special_name, discount_percent, special_data,
      sku, inventory_quantity, inventory_available, is_below_threshold, status,
      thc_percentage, cbd_percentage, cannabinoids,
      weight_mg, net_weight_value, net_weight_unit, options, raw_options,
      image_url, additional_images,
      is_featured, medical_only, rec_only,
      source_created_at, source_updated_at,
      description, raw_data,
      dutchie_url, last_seen_at, updated_at
    )
    VALUES (
      $1, $2, $3, $4, $5,
      $6, $7, $8,
      $9, $10, $11,
      $12, $13, $14, $15, $16,
      $17, $18, $19, $20,
      $21, $22, $23, $24, $25,
      $26, $27, $28,
      $29, $30, $31, $32, $33,
      $34, $35,
      $36, $37, $38,
      $39, $40,
      $41, $42,
      '', NOW(), NOW()
    )
    ON CONFLICT (store_id, slug) DO UPDATE SET
      name = EXCLUDED.name,
      enterprise_product_id = EXCLUDED.enterprise_product_id,
      brand = EXCLUDED.brand,
      brand_external_id = EXCLUDED.brand_external_id,
      brand_logo_url = EXCLUDED.brand_logo_url,
      subcategory = EXCLUDED.subcategory,
      strain_type = EXCLUDED.strain_type,
      canonical_category = EXCLUDED.canonical_category,
      price = EXCLUDED.price,
      rec_price = EXCLUDED.rec_price,
      med_price = EXCLUDED.med_price,
      rec_special_price = EXCLUDED.rec_special_price,
      med_special_price = EXCLUDED.med_special_price,
      is_on_special = EXCLUDED.is_on_special,
      special_name = EXCLUDED.special_name,
      discount_percent = EXCLUDED.discount_percent,
      special_data = EXCLUDED.special_data,
      sku = EXCLUDED.sku,
      inventory_quantity = EXCLUDED.inventory_quantity,
      inventory_available = EXCLUDED.inventory_available,
      is_below_threshold = EXCLUDED.is_below_threshold,
      status = EXCLUDED.status,
      thc_percentage = EXCLUDED.thc_percentage,
      cbd_percentage = EXCLUDED.cbd_percentage,
      cannabinoids = EXCLUDED.cannabinoids,
      weight_mg = EXCLUDED.weight_mg,
      net_weight_value = EXCLUDED.net_weight_value,
      net_weight_unit = EXCLUDED.net_weight_unit,
      options = EXCLUDED.options,
      raw_options = EXCLUDED.raw_options,
      image_url = EXCLUDED.image_url,
      additional_images = EXCLUDED.additional_images,
      is_featured = EXCLUDED.is_featured,
      medical_only = EXCLUDED.medical_only,
      rec_only = EXCLUDED.rec_only,
      source_created_at = EXCLUDED.source_created_at,
      source_updated_at = EXCLUDED.source_updated_at,
      description = EXCLUDED.description,
      raw_data = EXCLUDED.raw_data,
      last_seen_at = NOW(),
      updated_at = NOW()
    RETURNING (xmax = 0) AS was_inserted
  `;

  const client = await pool.connect();
  let inserted = 0;
  let updated = 0;
  // Set only when ROLLBACK itself fails, so the broken client is discarded
  // instead of being handed back to the pool.
  let rollbackFailure: Error | undefined;

  try {
    await client.query('BEGIN');

    for (const product of products) {
      // Parameter order must match the $1..$42 placeholders above.
      const result = await client.query(upsertSql, [
        storeId,
        product.external_id,
        product.slug,
        product.name,
        product.enterprise_product_id,
        product.brand,
        product.brand_external_id,
        product.brand_logo_url,
        product.subcategory,
        product.strain_type,
        product.canonical_category,
        product.price,
        product.rec_price,
        product.med_price,
        product.rec_special_price,
        product.med_special_price,
        product.is_on_special,
        product.special_name,
        product.discount_percent,
        product.special_data ? JSON.stringify(product.special_data) : null,
        product.sku,
        product.inventory_quantity,
        product.inventory_available,
        product.is_below_threshold,
        product.status,
        product.thc_percentage,
        product.cbd_percentage,
        product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
        product.weight_mg,
        product.net_weight_value,
        product.net_weight_unit,
        product.options,
        product.raw_options,
        product.image_url,
        product.additional_images,
        product.is_featured,
        product.medical_only,
        product.rec_only,
        product.source_created_at,
        product.source_updated_at,
        product.description,
        product.raw_data ? JSON.stringify(product.raw_data) : null,
      ]);

      if (result.rows[0]?.was_inserted) {
        inserted++;
      } else {
        updated++;
      }
    }

    await client.query('COMMIT');
    return { inserted, updated };
  } catch (error) {
    // A failing ROLLBACK (e.g. dropped connection) must not replace the
    // original error, which is the one the caller needs to see.
    try {
      await client.query('ROLLBACK');
    } catch (rollbackError) {
      rollbackFailure =
        rollbackError instanceof Error ? rollbackError : new Error(String(rollbackError));
      console.error(`[DutchieGraphQL] Rollback failed: ${rollbackFailure.message}`);
    }
    throw error;
  } finally {
    // Passing an error tells the pool to destroy the client rather than reuse it.
    client.release(rollbackFailure);
  }
}
|
||||
|
||||
// =====================================================
|
||||
// MAIN ENTRY POINT
|
||||
// =====================================================
|
||||
|
||||
/**
 * @deprecated DEPRECATED - Use src/dutchie-az/services/product-crawler.ts instead.
 * This function is disabled: it always rejects with an Error and performs no
 * scraping and no database writes. The signature is kept unchanged so existing
 * imports and call sites continue to compile.
 *
 * @param pool    Unused; retained for signature compatibility.
 * @param storeId Unused; retained for signature compatibility.
 * @param menuUrl Unused; retained for signature compatibility.
 * @throws {Error} Always, with a message pointing at the replacement crawler.
 */
export async function scrapeDutchieMenu(
  pool: Pool,
  storeId: number,
  menuUrl: string
): Promise<{
  success: boolean;
  productsFound: number;
  inserted: number;
  updated: number;
  error?: string;
}> {
  // The parameters are intentionally unused; referencing them keeps
  // unused-parameter checks quiet without changing the public signature.
  void pool;
  void storeId;
  void menuUrl;

  // DEPRECATED: Throw error to prevent accidental use
  throw new Error(
    'DEPRECATED: scrapeDutchieMenu() is deprecated. ' +
      'Use src/dutchie-az/services/product-crawler.ts instead. ' +
      'This scraper writes to the legacy products table.'
  );
}
|
||||
@@ -1,3 +1,9 @@
|
||||
// ============================================================================
|
||||
// DEPRECATED: Dutchie now crawled via GraphQL only (see dutchie-az pipeline)
|
||||
// DO NOT USE - This HTML scraper is unreliable and targets the legacy products table.
|
||||
// All Dutchie crawling must go through: src/dutchie-az/services/product-crawler.ts
|
||||
// ============================================================================
|
||||
|
||||
import { Page } from 'playwright';
|
||||
import { logger } from '../../services/logger';
|
||||
|
||||
@@ -9,8 +15,9 @@ export interface ScraperTemplate {
|
||||
}
|
||||
|
||||
/**
|
||||
* Dutchie marketplace scraper template
|
||||
* Used for: dutchie.com/dispensary/* URLs
|
||||
* @deprecated DEPRECATED - Dutchie HTML scraping is no longer supported.
|
||||
* Use the dutchie-az GraphQL pipeline instead: src/dutchie-az/services/product-crawler.ts
|
||||
* This template relied on unstable DOM selectors and wrote to legacy tables.
|
||||
*/
|
||||
export const dutchieTemplate: ScraperTemplate = {
|
||||
name: 'Dutchie Marketplace',
|
||||
|
||||
236
backend/src/scripts/capture-dutchie-schema.ts
Normal file
236
backend/src/scripts/capture-dutchie-schema.ts
Normal file
@@ -0,0 +1,236 @@
|
||||
/**
|
||||
* Capture Dutchie GraphQL response structure via Puppeteer interception
|
||||
* This script navigates to a Dutchie menu page and captures the GraphQL responses
|
||||
* to understand the exact product data structure
|
||||
*/
|
||||
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import * as fs from 'fs';
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
/** One intercepted GraphQL response, as stored by the page response listener. */
interface CapturedResponse {
  /** `operationName` query-string parameter of the request URL, or 'Unknown'. */
  operationName: string;
  /** Request URL, truncated to its first 200 characters. */
  url: string;
  /** Parsed JSON body of the response. */
  data: any;
  /** Moment the response was recorded. */
  timestamp: Date;
}
|
||||
|
||||
/**
 * Open a Dutchie menu in headless Chrome, record every JSON GraphQL response
 * the page triggers, and report the structure of the first product found.
 *
 * The product schema and a sample product are printed to stdout and, when a
 * product was captured, written to `/tmp/dutchie-schema-capture.json`.
 * Errors are logged rather than propagated; the browser is always closed.
 *
 * @param menuUrl URL of the menu page to load.
 */
async function captureSchema(menuUrl: string) {
  let browser;
  const capturedResponses: CapturedResponse[] = [];

  try {
    console.log('='.repeat(80));
    console.log('DUTCHIE GRAPHQL SCHEMA CAPTURE');
    console.log('='.repeat(80));
    console.log(`\nTarget URL: ${menuUrl}\n`);

    browser = await puppeteer.launch({
      headless: 'new',
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-blink-features=AutomationControlled',
      ]
    });

    const page = await browser.newPage();

    // Use a realistic user agent
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');

    // Set viewport to desktop size
    await page.setViewport({ width: 1920, height: 1080 });

    // Hide webdriver flag
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
      (window as any).chrome = { runtime: {} };
    });

    // Intercept all GraphQL responses
    page.on('response', async (response) => {
      const url = response.url();

      // Only capture GraphQL responses
      if (!url.includes('graphql')) return;

      try {
        const contentType = response.headers()['content-type'] || '';
        if (!contentType.includes('application/json')) return;

        const data = await response.json();

        // Extract operation name from URL if possible
        const urlParams = new URLSearchParams(url.split('?')[1] || '');
        const operationName = urlParams.get('operationName') || 'Unknown';

        capturedResponses.push({
          operationName,
          url: url.substring(0, 200),
          data,
          timestamp: new Date()
        });

        console.log(`📡 Captured: ${operationName}`);

        // Check for product data
        if (data?.data?.filteredProducts?.products) {
          const products = data.data.filteredProducts.products;
          console.log(` Found ${products.length} products`);
        }
      } catch (e) {
        // Best-effort capture: a response whose body cannot be read or
        // parsed is skipped on purpose.
      }
    });

    console.log('Navigating to page...');
    await page.goto(menuUrl, {
      waitUntil: 'networkidle2',
      timeout: 90000
    });

    // Check if it's a Dutchie menu
    const isDutchie = await page.evaluate(() => {
      return typeof (window as any).reactEnv !== 'undefined';
    });

    if (isDutchie) {
      console.log('✅ Dutchie menu detected\n');

      // Get environment info
      const reactEnv = await page.evaluate(() => (window as any).reactEnv);
      console.log('Dutchie Environment:');
      console.log(` dispensaryId: ${reactEnv?.dispensaryId}`);
      console.log(` retailerId: ${reactEnv?.retailerId}`);
      console.log(` chainId: ${reactEnv?.chainId}`);
    }

    // Scroll to trigger lazy loading
    console.log('\nScrolling to load more products...');
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await new Promise(r => setTimeout(r, 3000));

    // Click on a category to trigger more loads
    const categoryLinks = await page.$$('a[href*="/products/"]');
    if (categoryLinks.length > 0) {
      console.log(`Found ${categoryLinks.length} category links, clicking first one...`);
      try {
        // Register the navigation wait BEFORE clicking. If the click were
        // awaited first, a fast navigation could finish before the wait is
        // set up and the wait would then run into its timeout.
        await Promise.all([
          page.waitForNavigation({ waitUntil: 'networkidle2', timeout: 30000 }),
          categoryLinks[0].click(),
        ]);
      } catch (e) {
        console.log('Category navigation failed, continuing...');
      }
    }

    // Wait a bit more for any final responses
    await new Promise(r => setTimeout(r, 2000));

    console.log(`\n${'='.repeat(80)}`);
    console.log(`CAPTURED ${capturedResponses.length} GRAPHQL RESPONSES`);
    console.log('='.repeat(80));

    // Find product data: the first product of the first non-empty product
    // list becomes the sample and the source of the schema.
    let productSchema: any = null;
    let sampleProduct: any = null;

    for (const resp of capturedResponses) {
      console.log(`\n${resp.operationName}:`);
      console.log(` URL: ${resp.url.substring(0, 100)}...`);

      if (resp.data?.data?.filteredProducts?.products) {
        const products = resp.data.data.filteredProducts.products;
        console.log(` ✅ Contains ${products.length} products`);

        if (products.length > 0 && !sampleProduct) {
          sampleProduct = products[0];
          productSchema = extractSchema(products[0]);
        }
      }

      // Show top-level data keys
      if (resp.data?.data) {
        console.log(` Data keys: ${Object.keys(resp.data.data).join(', ')}`);
      }
    }

    // Output the product schema
    if (productSchema) {
      console.log('\n' + '='.repeat(80));
      console.log('PRODUCT SCHEMA (from first product):');
      console.log('='.repeat(80));
      console.log(JSON.stringify(productSchema, null, 2));

      console.log('\n' + '='.repeat(80));
      console.log('SAMPLE PRODUCT:');
      console.log('='.repeat(80));
      console.log(JSON.stringify(sampleProduct, null, 2));

      // Save to file (only a per-response summary is stored, not full bodies)
      const outputData = {
        capturedAt: new Date().toISOString(),
        menuUrl,
        schema: productSchema,
        sampleProduct,
        allResponses: capturedResponses.map(r => ({
          operationName: r.operationName,
          dataKeys: r.data?.data ? Object.keys(r.data.data) : [],
          productCount: r.data?.data?.filteredProducts?.products?.length || 0
        }))
      };

      const outputPath = '/tmp/dutchie-schema-capture.json';
      fs.writeFileSync(outputPath, JSON.stringify(outputData, null, 2));
      console.log(`\nSaved capture to: ${outputPath}`);
    } else {
      console.log('\n❌ No product data captured');

      // Debug: show all responses (first 500 characters of each body)
      console.log('\nAll captured responses:');
      for (const resp of capturedResponses) {
        console.log(`\n${resp.operationName}:`);
        console.log(JSON.stringify(resp.data, null, 2).substring(0, 500));
      }
    }

  } catch (error: any) {
    console.error('Error:', error.message);
  } finally {
    if (browser) {
      await browser.close();
    }
  }
}
|
||||
|
||||
/**
 * Describe the shape of a value as a nested schema of field names and types.
 *
 * - `null` / `undefined` become `{ type: 'null' }` / `{ type: 'undefined' }`.
 * - Arrays become `{ type: 'array', items }`, where `items` is the schema of
 *   the first element, or the string 'unknown' for an empty array.
 * - Objects become `{ type: 'object', properties }` with one entry per own key.
 * - Anything else becomes `{ type: typeof value, example }`, where `example`
 *   is the stringified value cut to 100 characters.
 *
 * @param obj    Value to describe.
 * @param prefix Dotted path of `obj` inside the root value; only threaded
 *               through the recursion, it does not appear in the result.
 * @returns The schema description for `obj`.
 */
function extractSchema(obj: any, prefix = ''): any {
  if (obj === null || obj === undefined) {
    return { type: obj === null ? 'null' : 'undefined' };
  }

  if (Array.isArray(obj)) {
    // Only the first element is sampled to describe the item type.
    const items = obj.length > 0 ? extractSchema(obj[0], `${prefix}[]`) : 'unknown';
    return { type: 'array', items };
  }

  if (typeof obj !== 'object') {
    return { type: typeof obj, example: String(obj).slice(0, 100) };
  }

  const properties: any = {};
  Object.keys(obj).forEach((field) => {
    const childPath = prefix ? `${prefix}.${field}` : field;
    properties[field] = extractSchema(obj[field], childPath);
  });
  return { type: 'object', properties };
}
|
||||
|
||||
// Entry point: the menu URL is the first CLI argument, with a sample menu as
// the fallback when none is given.
const url = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';

captureSchema(url).catch((err) => console.error(err));
|
||||
66
backend/src/scripts/crawl-all-dutchie.ts
Normal file
66
backend/src/scripts/crawl-all-dutchie.ts
Normal file
@@ -0,0 +1,66 @@
|
||||
/**
|
||||
* Seed crawl: trigger dutchie crawls for all dispensaries with menu_type='dutchie'
|
||||
* and a resolved platform_dispensary_id. This uses the AZ orchestrator endpoint logic.
|
||||
*
|
||||
* Usage (local):
|
||||
* node dist/scripts/crawl-all-dutchie.js
|
||||
*
|
||||
* Requires:
|
||||
* - DATABASE_URL/CRAWLSY_DATABASE_URL pointing to the consolidated DB
|
||||
* - Dispensaries table populated with menu_type and platform_dispensary_id
|
||||
*/
|
||||
|
||||
import { query } from '../dutchie-az/db/connection';
|
||||
import { runDispensaryOrchestrator } from '../services/dispensary-orchestrator';
|
||||
|
||||
/**
 * Crawl every dispensary that has `menu_type = 'dutchie'` and a non-null
 * `platform_dispensary_id`, one after another, and print a success/failure
 * tally. Exits the process with code 0 when no such dispensary exists.
 */
async function main() {
  const { rows: dispensaries } = await query<{
    id: number;
    name: string;
    slug: string;
    platform_dispensary_id: string | null;
  }>(`
    SELECT id, name, slug, platform_dispensary_id
    FROM dispensaries
    WHERE menu_type = 'dutchie'
      AND platform_dispensary_id IS NOT NULL
    ORDER BY id
  `);

  if (dispensaries.length === 0) {
    console.log('No dutchie dispensaries with resolved platform_dispensary_id found.');
    process.exit(0);
  }

  console.log(`Found ${dispensaries.length} dutchie dispensaries with resolved IDs. Triggering crawls...`);

  const tally = { success: 0, failed: 0 };

  for (const dispensary of dispensaries) {
    try {
      console.log(`Crawling ${dispensary.id} (${dispensary.name})...`);
      const outcome = await runDispensaryOrchestrator(dispensary.id);

      // These three statuses count as a successful run; anything else is a failure.
      switch (outcome.status) {
        case 'success':
        case 'sandbox_only':
        case 'detection_only':
          tally.success += 1;
          break;
        default:
          tally.failed += 1;
          console.warn(`Crawl returned status ${outcome.status} for ${dispensary.id} (${dispensary.name})`);
      }
    } catch (err: any) {
      // One failing dispensary must not stop the remaining crawls.
      tally.failed += 1;
      console.error(`Failed crawl for ${dispensary.id} (${dispensary.name}): ${err.message}`);
    }
  }

  console.log(`Completed. Success: ${tally.success}, Failed: ${tally.failed}`);
}
|
||||
|
||||
// Any error that escapes main() is fatal: report it and exit non-zero.
main().catch((fatalError) => {
  console.error('Fatal:', fatalError);
  process.exit(1);
});
|
||||
139
backend/src/scripts/run-dutchie-scrape.ts
Normal file
139
backend/src/scripts/run-dutchie-scrape.ts
Normal file
@@ -0,0 +1,139 @@
|
||||
/**
|
||||
* Run Dutchie GraphQL Scrape
|
||||
*
|
||||
* This script demonstrates the full pipeline:
|
||||
* 1. Puppeteer navigates to Dutchie menu
|
||||
* 2. GraphQL responses are intercepted
|
||||
* 3. Products are normalized to our schema
|
||||
* 4. Products are upserted to database
|
||||
* 5. Derived views (brands, categories, specials) are automatically updated
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import { scrapeDutchieMenu } from '../scrapers/dutchie-graphql';
|
||||
|
||||
// Connection string: DATABASE_URL from the environment, otherwise the local
// development database.
const DATABASE_URL =
  process.env.DATABASE_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';
|
||||
|
||||
/**
 * Run one scrape of a hard-coded store/menu through `scrapeDutchieMenu`, print
 * the result counters and, on success, print a few rows from the derived
 * views (`derived_brands`, `current_specials`, `derived_categories`) and one
 * sample row from `products`.
 *
 * The pool is always closed; any error is logged and then re-thrown.
 *
 * NOTE(review): `scrapeDutchieMenu` is marked deprecated and appears to throw
 * unconditionally — confirm whether this script is still expected to run.
 */
async function main() {
  const pool = new Pool({ connectionString: DATABASE_URL });

  try {
    console.log('='.repeat(80));
    console.log('DUTCHIE GRAPHQL SCRAPER - FULL PIPELINE TEST');
    console.log('='.repeat(80));
    // Mask the password part of the connection string before printing it.
    console.log(`Database: ${DATABASE_URL.replace(/:[^:@]+@/, ':***@')}`);

    // Configuration
    const storeId = 1; // Deeply Rooted
    const menuUrl = 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';

    console.log(`\nStore ID: ${storeId}`);
    console.log(`Menu URL: ${menuUrl}`);
    console.log('\n' + '-'.repeat(80));

    // Run the scrape
    console.log('\n🚀 Starting scrape...\n');
    const result = await scrapeDutchieMenu(pool, storeId, menuUrl);

    console.log('\n' + '-'.repeat(80));
    console.log('📊 SCRAPE RESULTS:');
    console.log('-'.repeat(80));
    console.log(` Success: ${result.success}`);
    console.log(` Products Found: ${result.productsFound}`);
    console.log(` Inserted: ${result.inserted}`);
    console.log(` Updated: ${result.updated}`);
    if (result.error) {
      console.log(` Error: ${result.error}`);
    }

    // Query derived views to show the result
    if (result.success) {
      console.log('\n' + '-'.repeat(80));
      console.log('📈 DERIVED DATA (from products table):');
      console.log('-'.repeat(80));

      // Brands
      const brandsResult = await pool.query(`
        SELECT brand_name, product_count, min_price, max_price
        FROM derived_brands
        WHERE store_id = $1
        ORDER BY product_count DESC
        LIMIT 5
      `, [storeId]);

      console.log('\nTop 5 Brands:');
      brandsResult.rows.forEach(row => {
        console.log(` - ${row.brand_name}: ${row.product_count} products ($${row.min_price} - $${row.max_price})`);
      });

      // Specials. Without an ORDER BY, LIMIT returns an arbitrary five rows;
      // order by discount so "Top 5" is deterministic and meaningful.
      const specialsResult = await pool.query(`
        SELECT name, brand, rec_price, rec_special_price, discount_percent
        FROM current_specials
        WHERE store_id = $1
        ORDER BY discount_percent DESC NULLS LAST
        LIMIT 5
      `, [storeId]);

      console.log('\nTop 5 Specials:');
      if (specialsResult.rows.length === 0) {
        console.log(' (No specials found - is_on_special may not be populated yet)');
      } else {
        specialsResult.rows.forEach(row => {
          console.log(` - ${row.name} (${row.brand}): $${row.rec_price} → $${row.rec_special_price} (${row.discount_percent}% off)`);
        });
      }

      // Categories
      const categoriesResult = await pool.query(`
        SELECT category_name, product_count
        FROM derived_categories
        WHERE store_id = $1
        ORDER BY product_count DESC
        LIMIT 5
      `, [storeId]);

      console.log('\nTop 5 Categories:');
      if (categoriesResult.rows.length === 0) {
        console.log(' (No categories found - subcategory may not be populated yet)');
      } else {
        categoriesResult.rows.forEach(row => {
          console.log(` - ${row.category_name}: ${row.product_count} products`);
        });
      }

      // Sample product: the most recently updated row that has a subcategory
      const sampleResult = await pool.query(`
        SELECT name, brand, subcategory, rec_price, rec_special_price, is_on_special, thc_percentage, status
        FROM products
        WHERE store_id = $1 AND subcategory IS NOT NULL
        ORDER BY updated_at DESC
        LIMIT 1
      `, [storeId]);

      if (sampleResult.rows.length > 0) {
        const sample = sampleResult.rows[0];
        console.log('\nSample Product (with new fields):');
        console.log(` Name: ${sample.name}`);
        console.log(` Brand: ${sample.brand}`);
        console.log(` Category: ${sample.subcategory}`);
        console.log(` Price: $${sample.rec_price}`);
        console.log(` Sale Price: ${sample.rec_special_price ? `$${sample.rec_special_price}` : 'N/A'}`);
        console.log(` On Special: ${sample.is_on_special}`);
        console.log(` THC: ${sample.thc_percentage}%`);
        console.log(` Status: ${sample.status}`);
      }
    }

    console.log('\n' + '='.repeat(80));
    console.log('✅ SCRAPE COMPLETE');
    console.log('='.repeat(80));

  } catch (error: any) {
    console.error('\n❌ Error:', error.message);
    throw error;
  } finally {
    await pool.end();
  }
}
|
||||
|
||||
// main() logs and re-throws on failure; also report the failure to the shell
// so callers (CI, cron, wrapper scripts) do not see exit code 0.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
|
||||
319
backend/src/scripts/scrape-all-active.ts
Normal file
319
backend/src/scripts/scrape-all-active.ts
Normal file
@@ -0,0 +1,319 @@
|
||||
/**
|
||||
* Scrape ALL active products via direct GraphQL pagination
|
||||
* This is more reliable than category navigation
|
||||
*/
|
||||
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
import { Pool } from 'pg';
|
||||
import { normalizeDutchieProduct, DutchieProduct } from '../scrapers/dutchie-graphql';
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
// Connection string: DATABASE_URL from the environment, otherwise the local
// development database.
const DATABASE_URL =
  process.env.DATABASE_URL ||
  'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus';

// Persisted-query hash sent as `sha256Hash` with every FilteredProducts request.
const GRAPHQL_HASH =
  'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';
|
||||
|
||||
/**
 * Upsert normalized products into `products` for one store inside a single
 * transaction, matching existing rows on `(store_id, slug)`.
 *
 * @param pool     Pool from which a dedicated client is checked out.
 * @param storeId  Value written to `products.store_id` for every row.
 * @param products Normalized products to persist; an empty list is a no-op.
 * @returns How many rows were inserted and how many were updated.
 * @throws Re-throws the first query error after attempting a ROLLBACK.
 */
async function upsertScrapedProducts(
  pool: Pool,
  storeId: number,
  products: ReturnType<typeof normalizeDutchieProduct>[]
): Promise<{ inserted: number; updated: number }> {
  if (products.length === 0) {
    return { inserted: 0, updated: 0 };
  }

  // `was_inserted` relies on the xmax system column being 0 for a row that
  // was freshly inserted rather than updated by ON CONFLICT.
  const upsertSql = `
    INSERT INTO products (
      store_id, external_id, slug, name, enterprise_product_id,
      brand, brand_external_id, brand_logo_url,
      subcategory, strain_type, canonical_category,
      price, rec_price, med_price, rec_special_price, med_special_price,
      is_on_special, special_name, discount_percent, special_data,
      sku, inventory_quantity, inventory_available, is_below_threshold, status,
      thc_percentage, cbd_percentage, cannabinoids,
      weight_mg, net_weight_value, net_weight_unit, options, raw_options,
      image_url, additional_images,
      is_featured, medical_only, rec_only,
      source_created_at, source_updated_at,
      description, raw_data,
      dutchie_url, last_seen_at, updated_at
    )
    VALUES (
      $1, $2, $3, $4, $5,
      $6, $7, $8,
      $9, $10, $11,
      $12, $13, $14, $15, $16,
      $17, $18, $19, $20,
      $21, $22, $23, $24, $25,
      $26, $27, $28,
      $29, $30, $31, $32, $33,
      $34, $35,
      $36, $37, $38,
      $39, $40,
      $41, $42,
      '', NOW(), NOW()
    )
    ON CONFLICT (store_id, slug) DO UPDATE SET
      name = EXCLUDED.name,
      enterprise_product_id = EXCLUDED.enterprise_product_id,
      brand = EXCLUDED.brand,
      brand_external_id = EXCLUDED.brand_external_id,
      brand_logo_url = EXCLUDED.brand_logo_url,
      subcategory = EXCLUDED.subcategory,
      strain_type = EXCLUDED.strain_type,
      canonical_category = EXCLUDED.canonical_category,
      price = EXCLUDED.price,
      rec_price = EXCLUDED.rec_price,
      med_price = EXCLUDED.med_price,
      rec_special_price = EXCLUDED.rec_special_price,
      med_special_price = EXCLUDED.med_special_price,
      is_on_special = EXCLUDED.is_on_special,
      special_name = EXCLUDED.special_name,
      discount_percent = EXCLUDED.discount_percent,
      special_data = EXCLUDED.special_data,
      sku = EXCLUDED.sku,
      inventory_quantity = EXCLUDED.inventory_quantity,
      inventory_available = EXCLUDED.inventory_available,
      is_below_threshold = EXCLUDED.is_below_threshold,
      status = EXCLUDED.status,
      thc_percentage = EXCLUDED.thc_percentage,
      cbd_percentage = EXCLUDED.cbd_percentage,
      cannabinoids = EXCLUDED.cannabinoids,
      weight_mg = EXCLUDED.weight_mg,
      net_weight_value = EXCLUDED.net_weight_value,
      net_weight_unit = EXCLUDED.net_weight_unit,
      options = EXCLUDED.options,
      raw_options = EXCLUDED.raw_options,
      image_url = EXCLUDED.image_url,
      additional_images = EXCLUDED.additional_images,
      is_featured = EXCLUDED.is_featured,
      medical_only = EXCLUDED.medical_only,
      rec_only = EXCLUDED.rec_only,
      source_created_at = EXCLUDED.source_created_at,
      source_updated_at = EXCLUDED.source_updated_at,
      description = EXCLUDED.description,
      raw_data = EXCLUDED.raw_data,
      last_seen_at = NOW(),
      updated_at = NOW()
    RETURNING (xmax = 0) AS was_inserted
  `;

  const client = await pool.connect();
  let inserted = 0;
  let updated = 0;
  // Set only when ROLLBACK itself fails, so the broken client is discarded
  // instead of being handed back to the pool.
  let rollbackFailure: Error | undefined;

  try {
    await client.query('BEGIN');

    for (const product of products) {
      // Parameter order must match the $1..$42 placeholders above.
      const result = await client.query(upsertSql, [
        storeId,
        product.external_id,
        product.slug,
        product.name,
        product.enterprise_product_id,
        product.brand,
        product.brand_external_id,
        product.brand_logo_url,
        product.subcategory,
        product.strain_type,
        product.canonical_category,
        product.price,
        product.rec_price,
        product.med_price,
        product.rec_special_price,
        product.med_special_price,
        product.is_on_special,
        product.special_name,
        product.discount_percent,
        product.special_data ? JSON.stringify(product.special_data) : null,
        product.sku,
        product.inventory_quantity,
        product.inventory_available,
        product.is_below_threshold,
        product.status,
        product.thc_percentage,
        product.cbd_percentage,
        product.cannabinoids ? JSON.stringify(product.cannabinoids) : null,
        product.weight_mg,
        product.net_weight_value,
        product.net_weight_unit,
        product.options,
        product.raw_options,
        product.image_url,
        product.additional_images,
        product.is_featured,
        product.medical_only,
        product.rec_only,
        product.source_created_at,
        product.source_updated_at,
        product.description,
        product.raw_data ? JSON.stringify(product.raw_data) : null,
      ]);

      if (result.rows[0]?.was_inserted) {
        inserted++;
      } else {
        updated++;
      }
    }

    await client.query('COMMIT');
    return { inserted, updated };
  } catch (error) {
    // A failing ROLLBACK must not replace the original error.
    try {
      await client.query('ROLLBACK');
    } catch (rollbackError) {
      rollbackFailure =
        rollbackError instanceof Error ? rollbackError : new Error(String(rollbackError));
      console.error(`Rollback failed: ${rollbackFailure.message}`);
    }
    throw error;
  } finally {
    // Passing an error tells the pool to destroy the client rather than reuse it.
    client.release(rollbackFailure);
  }
}

/**
 * Scrape every active product of one Dutchie menu and upsert it for a store.
 *
 * The menu page is loaded once in headless Chrome to establish a session and
 * read the dispensary ID from `window.reactEnv`; products are then fetched
 * page by page with the `FilteredProducts` persisted query from inside the
 * page, normalized, and written to `products`. Summary counts for the store
 * are printed at the end. The browser and the pool are always closed.
 *
 * @param menuUrl URL of the menu page used to establish the session.
 * @param storeId Value written to `products.store_id` for every row.
 * @returns `{ success, totalProducts, inserted, updated }`.
 * @throws When no dispensary ID can be read from the page, when a GraphQL
 *         request fails, or when a database operation fails.
 */
async function scrapeAllProducts(menuUrl: string, storeId: number) {
  const pool = new Pool({ connectionString: DATABASE_URL });
  let browser;

  try {
    browser = await puppeteer.launch({
      headless: 'new',
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });

    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36'
    );

    console.log('Loading menu to establish session...');
    await page.goto(menuUrl, {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });
    await new Promise((r) => setTimeout(r, 3000));

    const dispensaryId = await page.evaluate(() => (window as any).reactEnv?.dispensaryId);
    console.log('Dispensary ID:', dispensaryId);

    // Without an ID every request would silently come back empty and the run
    // would be reported as a success with zero products.
    if (!dispensaryId) {
      throw new Error(`Could not read a dispensary ID from ${menuUrl}`);
    }

    // Paginate through all products
    const allProducts: DutchieProduct[] = [];
    let pageNum = 0;
    const perPage = 100;

    console.log('\nFetching all products via paginated GraphQL...');

    while (true) {
      const result = await page.evaluate(
        async (dispId: string, hash: string, pageIndex: number, pageSize: number) => {
          const variables = {
            includeEnterpriseSpecials: false,
            productsFilter: {
              dispensaryId: dispId,
              pricingType: 'rec',
              Status: 'Active',
              types: [],
              useCache: false,
              isDefaultSort: true,
              sortBy: 'popularSortIdx',
              sortDirection: 1,
              bypassOnlineThresholds: true,
              isKioskMenu: false,
              removeProductsBelowOptionThresholds: false,
            },
            page: pageIndex,
            perPage: pageSize,
          };

          const qs = new URLSearchParams({
            operationName: 'FilteredProducts',
            variables: JSON.stringify(variables),
            extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }),
          });

          const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
            method: 'GET',
            headers: {
              'content-type': 'application/json',
              'apollographql-client-name': 'Marketplace (production)',
            },
            credentials: 'include',
          });

          // Surface transport failures instead of treating them as "no more pages".
          if (!resp.ok) {
            throw new Error(`GraphQL request failed with HTTP ${resp.status}`);
          }

          const json = await resp.json();
          const filtered = json?.data?.filteredProducts;

          // An error payload without data (for example an unknown persisted
          // query hash) must not be mistaken for an empty final page.
          if (!filtered && Array.isArray(json?.errors) && json.errors.length > 0) {
            throw new Error(`GraphQL error: ${json.errors[0]?.message || 'unknown error'}`);
          }

          return {
            products: filtered?.products || [],
            totalCount: filtered?.queryInfo?.totalCount,
          };
        },
        dispensaryId,
        GRAPHQL_HASH,
        pageNum,
        perPage
      );

      // An empty page marks the end of the result set.
      if (result.products.length === 0) {
        break;
      }

      allProducts.push(...result.products);
      console.log(
        `Page ${pageNum}: ${result.products.length} products (total so far: ${allProducts.length}/${result.totalCount})`
      );

      pageNum++;

      // Safety limit
      if (pageNum > 50) {
        console.log('Reached page limit');
        break;
      }
    }

    console.log(`\nTotal products fetched: ${allProducts.length}`);

    // Normalize and upsert
    console.log('\nNormalizing and upserting to database...');
    const normalized = allProducts.map(normalizeDutchieProduct);
    const { inserted, updated } = await upsertScrapedProducts(pool, storeId, normalized);

    console.log(`\nDatabase: ${inserted} inserted, ${updated} updated`);

    // Show summary stats
    const stats = await pool.query(
      `
      SELECT
        COUNT(*) as total,
        COUNT(*) FILTER (WHERE is_on_special) as specials,
        COUNT(DISTINCT brand) as brands,
        COUNT(DISTINCT subcategory) as categories
      FROM products WHERE store_id = $1
      `,
      [storeId]
    );

    console.log('\nStore summary:');
    console.log(` Total products: ${stats.rows[0].total}`);
    console.log(` On special: ${stats.rows[0].specials}`);
    console.log(` Unique brands: ${stats.rows[0].brands}`);
    console.log(` Categories: ${stats.rows[0].categories}`);

    return {
      success: true,
      totalProducts: allProducts.length,
      inserted,
      updated,
    };
  } finally {
    // Close the pool even when closing the browser fails.
    try {
      if (browser) {
        await browser.close();
      }
    } finally {
      await pool.end();
    }
  }
}
|
||||
|
||||
// Run
//
// CLI usage: <script> [menuUrl] [storeId]
// Both arguments are optional; the defaults below are used when they are omitted.
const menuUrl = process.argv[2] || 'https://dutchie.com/embedded-menu/AZ-Deeply-Rooted';
const storeId = parseInt(process.argv[3] || '1', 10);

// parseInt returns NaN for non-numeric input. Stop here instead of handing NaN to
// scrapeAllProducts, which uses storeId as a query parameter.
if (Number.isNaN(storeId)) {
  console.error(`Error: invalid store ID "${process.argv[3]}" (expected an integer)`);
  process.exit(1);
}

console.log('='.repeat(60));
console.log('DUTCHIE GRAPHQL FULL SCRAPE');
console.log('='.repeat(60));
console.log(`Menu URL: ${menuUrl}`);
console.log(`Store ID: ${storeId}`);
console.log('');

scrapeAllProducts(menuUrl, storeId)
  .then((result) => {
    console.log('\n' + '='.repeat(60));
    console.log('COMPLETE');
    console.log(JSON.stringify(result, null, 2));
  })
  .catch((error) => {
    // A rejection is not guaranteed to be an Error instance; fall back to the raw value.
    console.error('Error:', error instanceof Error ? error.message : error);
    // Non-zero exit code so shell callers can detect the failure.
    process.exit(1);
  });
|
||||
156
backend/src/scripts/test-dutchie-e2e.ts
Normal file
156
backend/src/scripts/test-dutchie-e2e.ts
Normal file
@@ -0,0 +1,156 @@
|
||||
/**
 * Test script: End-to-end Dutchie GraphQL → DB → Dashboard flow
 *
 * This demonstrates the data mapping stage of the pipeline:
 * 1. Load one sample Dutchie GraphQL product from a previous schema capture
 *    (read from /tmp/dutchie-schema-capture.json; this script does not fetch anything itself)
 * 2. Normalize it to our schema
 * 3. Show the mapping
 */
|
||||
|
||||
import { normalizeDutchieProduct, DutchieProduct, NormalizedProduct } from '../scrapers/dutchie-graphql';
|
||||
import * as fs from 'fs';
|
||||
|
||||
// Read the sample product that an earlier schema capture wrote to disk
const capturedData = JSON.parse(fs.readFileSync('/tmp/dutchie-schema-capture.json', 'utf-8'));
const sampleProduct: DutchieProduct = capturedData.sampleProduct;

console.log('='.repeat(80));
console.log('DUTCHIE GRAPHQL → DATABASE MAPPING DEMONSTRATION');
console.log('='.repeat(80));

console.log('\n📥 RAW DUTCHIE GRAPHQL PRODUCT:');
console.log('-'.repeat(80));

// Nested lookups that are reused for several of the fields shown below
const firstSaleSpecial = sampleProduct.specialData?.saleSpecials?.[0];
const firstPosChild = sampleProduct.POSMetaData?.children?.[0];

// The subset of raw GraphQL fields worth showing, keyed by their GraphQL path
const keyRawFields = {
  _id: sampleProduct._id,
  Name: sampleProduct.Name,
  cName: sampleProduct.cName,
  brandName: sampleProduct.brandName,
  'brand.id': sampleProduct.brand?.id,
  type: sampleProduct.type,
  subcategory: sampleProduct.subcategory,
  strainType: sampleProduct.strainType,
  Prices: sampleProduct.Prices,
  recPrices: sampleProduct.recPrices,
  recSpecialPrices: sampleProduct.recSpecialPrices,
  special: sampleProduct.special,
  'specialData.saleSpecials[0].specialName': firstSaleSpecial?.specialName,
  'specialData.saleSpecials[0].discount': firstSaleSpecial?.discount,
  'THCContent.range[0]': sampleProduct.THCContent?.range?.[0],
  'CBDContent.range[0]': sampleProduct.CBDContent?.range?.[0],
  Status: sampleProduct.Status,
  Image: sampleProduct.Image,
  'POSMetaData.canonicalSKU': sampleProduct.POSMetaData?.canonicalSKU,
  'POSMetaData.children[0].quantity': firstPosChild?.quantity,
  'POSMetaData.children[0].quantityAvailable': firstPosChild?.quantityAvailable,
};

for (const [key, value] of Object.entries(keyRawFields)) {
  console.log(` ${key}: ${JSON.stringify(value)}`);
}

console.log('\n📤 NORMALIZED DATABASE ROW:');
console.log('-'.repeat(80));

// Run the raw product through the same normalizer the scraper uses
const normalized: NormalizedProduct = normalizeDutchieProduct(sampleProduct);

// Leave the bulky JSON payload fields out of the dump to keep it readable
const { raw_data, cannabinoids, special_data, ...displayFields } = normalized;

for (const [key, value] of Object.entries(displayFields)) {
  // Fields the normalizer left unset are not printed
  if (value === undefined || value === null) {
    continue;
  }
  console.log(` ${key}: ${JSON.stringify(value)}`);
}
|
||||
|
||||
console.log('\n🔗 FIELD MAPPING:');
console.log('-'.repeat(80));

/**
 * Shorten a string for display, appending '...'.
 * Non-string values (including undefined/null) are returned untouched so a missing
 * field is shown as missing rather than as the literal text "undefined...".
 */
const previewText = (value: unknown, maxLength: number): unknown =>
  typeof value === 'string' ? value.substring(0, maxLength) + '...' : value;

// Each row: [GraphQL field, DB column, raw GraphQL value, normalized DB value]
const fieldMappings = [
  ['_id / id', 'external_id', sampleProduct._id, normalized.external_id],
  ['Name', 'name', sampleProduct.Name, normalized.name],
  ['cName', 'slug', sampleProduct.cName, normalized.slug],
  ['brandName', 'brand', sampleProduct.brandName, normalized.brand],
  ['brand.id', 'brand_external_id', sampleProduct.brand?.id, normalized.brand_external_id],
  ['subcategory', 'subcategory', sampleProduct.subcategory, normalized.subcategory],
  ['strainType', 'strain_type', sampleProduct.strainType, normalized.strain_type],
  ['recPrices[0]', 'rec_price', sampleProduct.recPrices?.[0], normalized.rec_price],
  ['recSpecialPrices[0]', 'rec_special_price', sampleProduct.recSpecialPrices?.[0], normalized.rec_special_price],
  ['special', 'is_on_special', sampleProduct.special, normalized.is_on_special],
  [
    'specialData...specialName',
    'special_name',
    previewText(sampleProduct.specialData?.saleSpecials?.[0]?.specialName, 40),
    previewText(normalized.special_name, 40),
  ],
  ['THCContent.range[0]', 'thc_percentage', sampleProduct.THCContent?.range?.[0], normalized.thc_percentage],
  ['CBDContent.range[0]', 'cbd_percentage', sampleProduct.CBDContent?.range?.[0], normalized.cbd_percentage],
  ['Status', 'status', sampleProduct.Status, normalized.status],
  ['Image', 'image_url', previewText(sampleProduct.Image, 50), previewText(normalized.image_url, 50)],
  ['POSMetaData.canonicalSKU', 'sku', sampleProduct.POSMetaData?.canonicalSKU, normalized.sku],
];

console.log(' GraphQL Field → DB Column | Value');
console.log(' ' + '-'.repeat(75));

// Only the normalized value is printed; the raw GraphQL value (third element) is skipped.
fieldMappings.forEach(([gqlField, dbCol, , dbVal]) => {
  const gqlStr = String(gqlField).padEnd(30);
  const dbStr = String(dbCol).padEnd(20);
  console.log(` ${gqlStr} → ${dbStr} | ${JSON.stringify(dbVal)}`);
});
|
||||
|
||||
console.log('\n📊 SQL INSERT STATEMENT:');
console.log('-'.repeat(80));

/**
 * Render a value as a SQL literal for the printed example below.
 * - undefined/null become NULL (instead of the strings 'undefined' / 'null')
 * - numbers and booleans are printed bare, so 0 stays 0 rather than becoming NULL
 * - everything else is single-quoted with embedded quotes doubled
 *
 * This is for display only: the statement is printed, never executed. Real writes
 * must go through bound parameters ($1, $2, ...) rather than interpolated text.
 */
const toSqlLiteral = (value: unknown): string => {
  if (value === undefined || value === null) {
    return 'NULL';
  }
  if (typeof value === 'number' || typeof value === 'boolean') {
    return String(value);
  }
  return `'${String(value).replace(/'/g, "''")}'`;
};

// Special names can be long; show only the first 50 characters when one is present
const specialNamePreview =
  typeof normalized.special_name === 'string'
    ? normalized.special_name.substring(0, 50) + '...'
    : null;

// Generate example SQL
const sqlExample = `
INSERT INTO products (
  store_id, external_id, slug, name,
  brand, brand_external_id,
  subcategory, strain_type,
  rec_price, rec_special_price,
  is_on_special, special_name, discount_percent,
  thc_percentage, cbd_percentage,
  status, image_url, sku
) VALUES (
  1, -- store_id (Deeply Rooted)
  ${toSqlLiteral(normalized.external_id)}, -- external_id
  ${toSqlLiteral(normalized.slug)}, -- slug
  ${toSqlLiteral(normalized.name)}, -- name
  ${toSqlLiteral(normalized.brand)}, -- brand
  ${toSqlLiteral(normalized.brand_external_id)}, -- brand_external_id
  ${toSqlLiteral(normalized.subcategory)}, -- subcategory
  ${toSqlLiteral(normalized.strain_type)}, -- strain_type
  ${toSqlLiteral(normalized.rec_price)}, -- rec_price
  ${toSqlLiteral(normalized.rec_special_price)}, -- rec_special_price
  ${toSqlLiteral(normalized.is_on_special)}, -- is_on_special
  ${toSqlLiteral(specialNamePreview)}, -- special_name
  ${toSqlLiteral(normalized.discount_percent)}, -- discount_percent
  ${toSqlLiteral(normalized.thc_percentage)}, -- thc_percentage
  ${toSqlLiteral(normalized.cbd_percentage)}, -- cbd_percentage
  ${toSqlLiteral(normalized.status)}, -- status
  ${toSqlLiteral(normalized.image_url)}, -- image_url
  ${toSqlLiteral(normalized.sku)} -- sku
)
ON CONFLICT (store_id, slug) DO UPDATE SET ...;
`;

console.log(sqlExample);
|
||||
|
||||
// Closing report: one line per headline field of the normalized product, followed by
// a note on the views derived from the products table.
const summaryLines = [
  '\n✅ SUMMARY:',
  '-'.repeat(80),
  ` Product: ${normalized.name}`,
  ` Brand: ${normalized.brand}`,
  ` Category: ${normalized.subcategory}`,
  ` Price: $${normalized.rec_price} → $${normalized.rec_special_price} (${normalized.discount_percent}% off)`,
  ` THC: ${normalized.thc_percentage}%`,
  ` Status: ${normalized.status}`,
  ` On Special: ${normalized.is_on_special}`,
  ` SKU: ${normalized.sku}`,
  '\n🎯 DERIVED VIEWS (computed from products table):',
  '-'.repeat(80),
  ' - current_specials: Products where is_on_special = true',
  ' - derived_brands: Aggregated by brand name with counts/prices',
  ' - derived_categories: Aggregated by subcategory',
  '\nAll views are computed from the single products table - no separate tables needed!',
];

for (const line of summaryLines) {
  console.log(line);
}
|
||||
233
backend/src/scripts/test-dutchie-graphql.ts
Normal file
233
backend/src/scripts/test-dutchie-graphql.ts
Normal file
@@ -0,0 +1,233 @@
|
||||
/**
|
||||
* Test script to validate Dutchie GraphQL API access and capture response structure
|
||||
*/
|
||||
|
||||
// @ts-ignore - node-fetch type declaration not installed
|
||||
import fetch from 'node-fetch';
|
||||
|
||||
/**
 * sha256 hashes of Dutchie persisted queries, keyed by GraphQL operation name.
 * The hash is sent in the request's `extensions.persistedQuery.sha256Hash` field.
 */
const GRAPHQL_HASHES = {
  ConsumerDispensaries: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',
  GetAddressBasedDispensaryData: '13461f73abf7268770dfd05fe7e10c523084b2bb916a929c08efe3d87531977b',
  FilteredProducts: 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0',
  MenuFiltersV2: '2f0b3233b8a2426b391649ca3f0f7a5d43b9aefd683f6286d7261a2517e3568e',
  FilteredSpecials: '0dfb85a4fc138c55a076d4d11bf6d1a25f7cbd511428e1cf5a5b863b3eb23f25',
};
|
||||
|
||||
/**
 * Loose shape of a product as this test script expects it from the GraphQL response.
 * Only `id` and `name` are required; the index signature admits any other field.
 */
interface DutchieProduct {
  // Identity
  id: string;
  name: string;
  slug?: string;

  // Brand and classification
  brand?: string;
  brandId?: string;
  type?: string;
  category?: string;
  subcategory?: string;
  strainType?: string;

  // Presentation
  description?: string;
  image?: string;
  images?: string[];

  // Potency and lab data (structure not pinned down here, hence `any`)
  THCContent?: any;
  CBDContent?: any;
  potencyThc?: any;
  potencyCbd?: any;
  labResults?: any;
  terpenes?: any[];
  effects?: string[];

  // Variants and pricing
  weight?: string;
  options?: any[];
  pricing?: any;
  specialPricing?: any;

  // Catch-all for additional fields
  [key: string]: any;
}
|
||||
|
||||
/**
 * Request one page of products for a dispensary through the persisted
 * `FilteredProducts` GraphQL query.
 *
 * @param dispensaryId - dispensary ID placed in the products filter
 * @param page - page index to request (default 0)
 * @param perPage - page size to request (default 25)
 * @returns the parsed JSON response body
 * @throws Error when the HTTP response status is not OK
 */
async function fetchProducts(dispensaryId: string, page = 0, perPage = 25): Promise<any> {
  const sessionId = 'crawlsy-session-' + Date.now();

  const productsFilter = {
    dispensaryId,
    pricingType: 'rec',
    Status: null, // null to include all (in-stock and out-of-stock)
    types: [],
    useCache: true,
    isDefaultSort: true,
    sortBy: 'popularSortIdx',
    sortDirection: 1,
    bypassOnlineThresholds: true,
    isKioskMenu: false,
    removeProductsBelowOptionThresholds: false
  };
  const requestVariables = {
    includeEnterpriseSpecials: false,
    productsFilter,
    page,
    perPage
  };
  const persistedQuery = { version: 1, sha256Hash: GRAPHQL_HASHES.FilteredProducts };

  // The whole query travels in the URL: operation name, variables and persisted-query hash
  const query = new URLSearchParams();
  query.set('operationName', 'FilteredProducts');
  query.set('variables', JSON.stringify(requestVariables));
  query.set('extensions', JSON.stringify({ persistedQuery }));

  const response = await fetch(`https://dutchie.com/api-3/graphql?${query.toString()}`, {
    headers: {
      'x-dutchie-session': sessionId,
      'apollographql-client-name': 'Marketplace (production)',
      'content-type': 'application/json',
      'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
  });

  if (response.ok) {
    return response.json();
  }

  // Log the status and the start of the body, then fail with a shorter excerpt
  const body = await response.text();
  console.error('HTTP Status:', response.status);
  console.error('Response:', body.substring(0, 500));
  throw new Error(`HTTP ${response.status}: ${body.substring(0, 200)}`);
}
|
||||
|
||||
/**
 * Look up a dispensary ID from its cName through the persisted
 * `GetAddressBasedDispensaryData` GraphQL query.
 *
 * @param cName - dispensary cName, sent as `input.dispensaryId`
 * @returns the ID found in the response, or null when the request fails or the
 *          response carries no ID
 */
async function resolveDispensaryId(cName: string): Promise<string | null> {
  const sessionId = 'crawlsy-session-' + Date.now();

  const query = new URLSearchParams();
  query.set('operationName', 'GetAddressBasedDispensaryData');
  query.set('variables', JSON.stringify({ input: { dispensaryId: cName } }));
  query.set(
    'extensions',
    JSON.stringify({ persistedQuery: { version: 1, sha256Hash: GRAPHQL_HASHES.GetAddressBasedDispensaryData } })
  );

  const response = await fetch(`https://dutchie.com/graphql?${query.toString()}`, {
    headers: {
      'x-dutchie-session': sessionId,
      'apollographql-client-name': 'Marketplace (production)',
      'content-type': 'application/json',
      'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
  });

  if (!response.ok) {
    // Report the failure and let the caller decide on a fallback
    console.error('Failed to resolve dispensary ID:', response.status);
    return null;
  }

  const payload: any = await response.json();
  const resolvedId = payload?.data?.getAddressBasedDispensaryData?.dispensaryData?.dispensaryId;
  return resolvedId || null;
}
|
||||
|
||||
/**
 * Flatten an object into a list of "path: description" strings, one per field,
 * recursing into nested objects and into the FIRST element of each array.
 *
 * Descriptions by value kind:
 *   null       -> `path: null`
 *   array      -> `path: Array[length]`, then the fields of element 0 under `path[0]`
 *   object     -> `path: Object`, then its fields under `path`
 *   otherwise  -> `path: <typeof> = "<first 50 chars of String(value)>"`
 *
 * @param obj - value to inspect; null/undefined yields an empty list
 * @param prefix - path prefix applied to every emitted field (default none)
 * @returns the field descriptions in Object.entries order
 */
function enumerateFields(obj: any, prefix = ''): string[] {
  const fields: string[] = [];

  // Object.entries throws on null/undefined; there is nothing to enumerate in that case.
  if (obj === null || obj === undefined) {
    return fields;
  }

  for (const [key, value] of Object.entries(obj)) {
    const path = prefix ? `${prefix}.${key}` : key;

    if (value === null) {
      fields.push(`${path}: null`);
    } else if (Array.isArray(value)) {
      fields.push(`${path}: Array[${value.length}]`);
      const first = value[0];
      // typeof null is 'object', so a null first element must be excluded explicitly
      // before recursing; otherwise the recursive call would receive null.
      if (first !== null && typeof first === 'object') {
        fields.push(...enumerateFields(first, `${path}[0]`));
      }
    } else if (typeof value === 'object') {
      fields.push(`${path}: Object`);
      fields.push(...enumerateFields(value, path));
    } else {
      const typeStr = typeof value;
      // Long scalars are cut to 50 characters to keep the listing readable
      const preview = String(value).substring(0, 50);
      fields.push(`${path}: ${typeStr} = "${preview}"`);
    }
  }

  return fields;
}
|
||||
|
||||
/**
 * Drive the API check end to end: resolve the dispensary ID (falling back to a
 * hardcoded one), fetch a small page of products, and print the first product's
 * field structure, full JSON, key fields and first option.
 *
 * GraphQL errors are printed and mark the process as failed via process.exitCode.
 */
async function main() {
  console.log('='.repeat(80));
  console.log('DUTCHIE GRAPHQL API TEST');
  console.log('='.repeat(80));

  const cName = 'AZ-Deeply-Rooted';

  // Step 1: Resolve dispensary ID
  console.log(`\n1. Resolving dispensary ID for "${cName}"...`);
  const dispensaryId = await resolveDispensaryId(cName);

  const finalDispensaryId = dispensaryId || '6405ef617056e8014d79101b'; // Fallback to known ID
  if (!dispensaryId) {
    console.log(' Failed to resolve via API, using hardcoded ID: 6405ef617056e8014d79101b');
  }

  console.log(` Final ID: ${finalDispensaryId}`);

  // Step 2: Fetch first page of products
  console.log('\n2. Fetching products (page 0, perPage 5)...');
  const result = await fetchProducts(finalDispensaryId, 0, 5);

  if (result.errors) {
    console.error('\nGraphQL Errors:');
    console.error(JSON.stringify(result.errors, null, 2));
    // The API check did not succeed; make that visible to whoever launched the script.
    process.exitCode = 1;
    return;
  }

  const products = result?.data?.filteredProducts?.products || [];
  console.log(` Found ${products.length} products in this page`);

  if (products.length === 0) {
    console.log('No products returned. Full response:');
    console.log(JSON.stringify(result, null, 2));
    return;
  }

  // Step 3: Enumerate all fields from first product
  console.log('\n3. PRODUCT FIELD STRUCTURE (from first product):');
  console.log('-'.repeat(80));

  const product = products[0];
  const fields = enumerateFields(product);
  fields.forEach(f => console.log(` ${f}`));

  // Step 4: Show full sample product JSON
  console.log('\n4. FULL SAMPLE PRODUCT JSON:');
  console.log('-'.repeat(80));
  console.log(JSON.stringify(product, null, 2));

  // Step 5: Summary of key fields for schema design
  console.log('\n5. KEY FIELDS FOR SCHEMA DESIGN:');
  console.log('-'.repeat(80));

  // Truncate the description only when there is one; concatenating '...' onto a
  // missing description would print the misleading text "undefined...".
  const descriptionPreview =
    typeof product.description === 'string'
      ? product.description.substring(0, 100) + '...'
      : product.description;

  const keyFields = [
    { field: 'id', value: product.id },
    { field: 'name', value: product.name },
    { field: 'slug', value: product.slug },
    { field: 'brand', value: product.brand },
    { field: 'brandId', value: product.brandId },
    { field: 'type', value: product.type },
    { field: 'category', value: product.category },
    { field: 'subcategory', value: product.subcategory },
    { field: 'strainType', value: product.strainType },
    { field: 'THCContent', value: product.THCContent },
    { field: 'CBDContent', value: product.CBDContent },
    { field: 'description', value: descriptionPreview },
    { field: 'image', value: product.image },
    { field: 'options.length', value: product.options?.length },
    { field: 'pricing', value: product.pricing },
    { field: 'terpenes.length', value: product.terpenes?.length },
    { field: 'effects.length', value: product.effects?.length },
  ];

  keyFields.forEach(({ field, value }) => {
    console.log(` ${field}: ${JSON.stringify(value)}`);
  });

  // Step 6: Show an option (variant) if available
  if (product.options && product.options.length > 0) {
    console.log('\n6. SAMPLE OPTION/VARIANT:');
    console.log('-'.repeat(80));
    console.log(JSON.stringify(product.options[0], null, 2));
  }
}

main().catch((error) => {
  console.error(error);
  // Report failure through the exit code instead of exiting 0 after an error.
  process.exitCode = 1;
});
|
||||
106
backend/src/scripts/test-status-filter.ts
Normal file
106
backend/src/scripts/test-status-filter.ts
Normal file
@@ -0,0 +1,106 @@
|
||||
/**
|
||||
* Test different Status filter values in Dutchie GraphQL
|
||||
*/
|
||||
|
||||
import puppeteer from 'puppeteer-extra';
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
||||
|
||||
puppeteer.use(StealthPlugin());

// sha256 hash sent as the persisted-query hash for the FilteredProducts operation
const GRAPHQL_HASH = 'ee29c060826dc41c527e470e9ae502c9b2c169720faa0a9f5d25e1b9a530a4a0';

/**
 * Load the embedded menu in a headless browser, read the dispensary ID from the page,
 * then run the FilteredProducts query from inside the page once per Status variant
 * (Active, Inactive, null, omitted) and print the counts and statuses returned.
 *
 * The browser is always closed, including when a step throws.
 *
 * @throws Error when the page does not expose a dispensary ID
 */
async function main() {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0.0.0 Safari/537.36'
    );

    console.log('Loading menu...');
    await page.goto('https://dutchie.com/embedded-menu/AZ-Deeply-Rooted', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });
    // Extra settle time after network idle before reading page state
    await new Promise((r) => setTimeout(r, 3000));

    const dispensaryId = await page.evaluate(() => (window as any).reactEnv?.dispensaryId);
    console.log('Dispensary ID:', dispensaryId);

    // Without an ID every query below would be sent with an undefined dispensary filter.
    if (!dispensaryId) {
      throw new Error('Could not read dispensaryId from window.reactEnv');
    }

    // Test different status values
    const testCases = [
      { label: 'Active', status: 'Active', includeStatus: true },
      { label: 'Inactive', status: 'Inactive', includeStatus: true },
      { label: 'null', status: null, includeStatus: true },
      { label: 'omitted', status: null, includeStatus: false },
    ];

    for (const testCase of testCases) {
      // This callback runs inside the page, so it can only use the arguments passed in.
      const result = await page.evaluate(
        async (dispId: string, hash: string, status: string | null, includeStatus: boolean) => {
          const filter: any = {
            dispensaryId: dispId,
            pricingType: 'rec',
            types: [],
            useCache: false,
            isDefaultSort: true,
            sortBy: 'popularSortIdx',
            sortDirection: 1,
            bypassOnlineThresholds: true,
            isKioskMenu: false,
            removeProductsBelowOptionThresholds: false,
          };

          // "omitted" leaves the key out entirely, which differs from sending Status: null
          if (includeStatus) {
            filter.Status = status;
          }

          const variables = {
            includeEnterpriseSpecials: false,
            productsFilter: filter,
            page: 0,
            perPage: 100,
          };

          const qs = new URLSearchParams({
            operationName: 'FilteredProducts',
            variables: JSON.stringify(variables),
            extensions: JSON.stringify({ persistedQuery: { version: 1, sha256Hash: hash } }),
          });

          const resp = await fetch(`https://dutchie.com/graphql?${qs.toString()}`, {
            method: 'GET',
            headers: {
              'content-type': 'application/json',
              'apollographql-client-name': 'Marketplace (production)',
            },
            credentials: 'include',
          });

          const json = await resp.json();
          const products = json?.data?.filteredProducts?.products || [];
          return {
            count: products.length,
            totalCount: json?.data?.filteredProducts?.queryInfo?.totalCount,
            sampleStatus: products[0]?.Status,
            statuses: [...new Set(products.map((p: any) => p.Status))],
          };
        },
        dispensaryId,
        GRAPHQL_HASH,
        testCase.status,
        testCase.includeStatus
      );

      console.log(
        `Status ${testCase.label}: Products=${result.count}, Total=${result.totalCount}, Statuses=${JSON.stringify(result.statuses)}`
      );
    }
  } finally {
    // Close the browser on every path so a failed run does not leave it running.
    await browser.close();
  }
}

main().catch((error) => {
  console.error(error);
  // Report failure through the exit code instead of exiting 0 after an error.
  process.exitCode = 1;
});
|
||||
@@ -20,7 +20,13 @@ import {
|
||||
MultiCategoryDetectionResult,
|
||||
} from './intelligence-detector';
|
||||
import { runCrawlProductsJob, runSandboxProductsJob } from './category-crawler-jobs';
|
||||
import { scrapeStore } from '../scraper-v2';
|
||||
// DEPRECATED: scrapeStore writes to legacy products table
|
||||
// import { scrapeStore } from '../scraper-v2';
|
||||
|
||||
// Import the new dutchie-az pipeline for Dutchie crawling
|
||||
import { crawlDispensaryProducts } from '../dutchie-az/services/product-crawler';
|
||||
import { query as dutchieAzQuery } from '../dutchie-az/db/connection';
|
||||
import { Dispensary as DutchieAzDispensary } from '../dutchie-az/types';
|
||||
|
||||
// ========================================
|
||||
// Types
|
||||
@@ -159,24 +165,43 @@ export async function runStoreCrawlOrchestrator(storeId: number): Promise<Orches
|
||||
const mode = store.product_crawler_mode;
|
||||
|
||||
if (provider === 'dutchie' && mode === 'production') {
|
||||
// Production Dutchie crawl
|
||||
await updateScheduleStatus(storeId, 'running', 'Running Dutchie production crawl...', runId);
|
||||
// Production Dutchie crawl - now uses the new dutchie-az GraphQL pipeline
|
||||
await updateScheduleStatus(storeId, 'running', 'Running Dutchie GraphQL crawl (dutchie-az)...', runId);
|
||||
|
||||
try {
|
||||
// Run the actual scraper
|
||||
await scrapeStore(storeId);
|
||||
// Look up the dispensary in the dutchie-az database
|
||||
// The dutchie-az pipeline has its own dispensaries table
|
||||
// We match case-insensitively (ILIKE) on name or slug; platform_dispensary_id is not part of this lookup
|
||||
const dispensaryResult = await dutchieAzQuery<DutchieAzDispensary>(
|
||||
`SELECT * FROM dispensaries
|
||||
WHERE name ILIKE $1
|
||||
OR slug ILIKE $2
|
||||
LIMIT 1`,
|
||||
[store.dispensary_name, store.slug]
|
||||
);
|
||||
|
||||
// Get crawl stats from the latest job
|
||||
const stats = await getLatestCrawlStats(storeId);
|
||||
if (dispensaryResult.rows.length === 0) {
|
||||
throw new Error(
|
||||
`Dispensary not found in dutchie-az database. ` +
|
||||
`You must add this dispensary to the dutchie-az pipeline first. ` +
|
||||
`Store: ${store.name} (${store.dispensary_name})`
|
||||
);
|
||||
}
|
||||
|
||||
const dutchieDispensary = dispensaryResult.rows[0];
|
||||
|
||||
// Run the new dutchie-az GraphQL crawler
|
||||
const crawlResult = await crawlDispensaryProducts(dutchieDispensary, 'rec', { useBothModes: true });
|
||||
|
||||
result.crawlRan = true;
|
||||
result.crawlType = 'production';
|
||||
result.productsFound = stats.products_found ?? undefined;
|
||||
result.productsNew = stats.products_new ?? undefined;
|
||||
result.productsUpdated = stats.products_updated ?? undefined;
|
||||
result.productsFound = crawlResult.productsFound ?? undefined;
|
||||
result.productsNew = crawlResult.productsUpserted ?? undefined;
|
||||
result.productsUpdated = crawlResult.snapshotsCreated ?? undefined;
|
||||
|
||||
if (crawlResult.success) {
|
||||
const detectionPart = result.detectionRan ? 'Detection + ' : '';
|
||||
result.summary = `${detectionPart}Dutchie products crawl (${stats.products_found || 0} items, ${stats.products_new || 0} new, ${stats.products_updated || 0} updated)`;
|
||||
result.summary = `${detectionPart}Dutchie GraphQL crawl (${crawlResult.productsFound || 0} items, ${crawlResult.productsUpserted || 0} upserted, ${crawlResult.snapshotsCreated || 0} snapshots)`;
|
||||
result.status = 'success';
|
||||
|
||||
// Update store's last_scraped_at
|
||||
@@ -186,12 +211,15 @@ export async function runStoreCrawlOrchestrator(storeId: number): Promise<Orches
|
||||
job_id: 0, // Orchestrator doesn't create traditional jobs
|
||||
store_id: storeId,
|
||||
store_name: store.name,
|
||||
duration_ms: Date.now() - startTime,
|
||||
products_found: stats.products_found || 0,
|
||||
products_new: stats.products_new || 0,
|
||||
products_updated: stats.products_updated || 0,
|
||||
duration_ms: crawlResult.durationMs,
|
||||
products_found: crawlResult.productsFound || 0,
|
||||
products_new: crawlResult.productsUpserted || 0,
|
||||
products_updated: crawlResult.snapshotsCreated || 0,
|
||||
provider: 'dutchie',
|
||||
});
|
||||
} else {
|
||||
throw new Error(crawlResult.errorMessage || 'Crawl failed');
|
||||
}
|
||||
|
||||
} catch (crawlError: any) {
|
||||
result.status = 'error';
|
||||
|
||||
322
backend/src/utils/image-storage.ts
Normal file
322
backend/src/utils/image-storage.ts
Normal file
@@ -0,0 +1,322 @@
|
||||
/**
 * Local Image Storage Utility
 *
 * Downloads and stores product images to local filesystem.
 * Replaces MinIO-based storage with simple local file storage.
 *
 * Directory structure (<url_hash> is the first 8 hex characters of the MD5 of the source URL):
 * /images/products/<dispensary_id>/<product_id>-<url_hash>.webp
 * /images/products/<dispensary_id>/<product_id>-<url_hash>-thumb.webp
 * /images/products/<dispensary_id>/<product_id>-<url_hash>-medium.webp
 * /images/brands/<brand_id>-<url_hash>.webp
 */
|
||||
|
||||
import axios from 'axios';
|
||||
import sharp from 'sharp';
|
||||
import * as fs from 'fs/promises';
|
||||
import * as path from 'path';
|
||||
import { createHash } from 'crypto';
|
||||
|
||||
// Filesystem directory under which all images are written.
// Set IMAGES_PATH to override the default.
const IMAGES_BASE_PATH = process.env.IMAGES_PATH || '/app/public/images';

// URL prefix that stored image paths are rewritten to when building public URLs.
// Set IMAGES_PUBLIC_URL to override the default.
const IMAGES_PUBLIC_URL = process.env.IMAGES_PUBLIC_URL || '/images';
|
||||
|
||||
/**
 * Public URLs of the stored renditions of one image.
 * Product file names carry a URL-hash suffix, e.g. 456-1a2b3c4d.webp.
 */
export interface LocalImageSizes {
  /** Largest rendition, e.g. /images/products/123/456-1a2b3c4d.webp */
  full: string;
  /** Mid-size rendition, e.g. /images/products/123/456-1a2b3c4d-medium.webp */
  medium: string;
  /** Thumbnail rendition, e.g. /images/products/123/456-1a2b3c4d-thumb.webp */
  thumb: string;
}
|
||||
|
||||
/**
 * Outcome of a download-and-store attempt.
 */
export interface DownloadResult {
  /** True when the image was downloaded, processed and written */
  success: boolean;
  /** Public URLs of the stored renditions; set on success */
  urls?: LocalImageSizes;
  /** Failure description; set when success is false */
  error?: string;
  /** Combined size in bytes of the files written; set on success */
  bytesDownloaded?: number;
}
|
||||
|
||||
/**
 * Ensure a directory exists, creating missing parent directories as needed.
 *
 * With `recursive: true`, mkdir already resolves when the directory is present, so
 * no EEXIST handling is required for that case. An EEXIST raised in recursive mode
 * means the path is occupied by something that is not a directory; that error is
 * deliberately left to propagate instead of being swallowed, so the caller sees the
 * real cause rather than a later write failure.
 *
 * @param dirPath - directory to create
 * @throws any error raised by fs.mkdir
 */
async function ensureDir(dirPath: string): Promise<void> {
  await fs.mkdir(dirPath, { recursive: true });
}
|
||||
|
||||
/**
 * Derive a short fingerprint of a URL: the first 8 hex characters of its MD5 digest.
 * Used as a filename suffix so different source URLs map to different files.
 */
function hashUrl(url: string): string {
  const md5 = createHash('md5');
  md5.update(url);
  const hexDigest = md5.digest('hex');
  return hexDigest.slice(0, 8);
}
|
||||
|
||||
/**
 * Fetch an image over HTTP and return its bytes.
 *
 * @param imageUrl - URL to download
 * @returns the response body as a Buffer
 * @throws whatever axios raises on failure (the request has a 30 second timeout)
 */
async function downloadImage(imageUrl: string): Promise<Buffer> {
  const requestHeaders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
  };

  // arraybuffer keeps the body as raw bytes instead of decoding it as text
  const { data } = await axios.get(imageUrl, {
    responseType: 'arraybuffer',
    timeout: 30000,
    headers: requestHeaders,
  });

  return Buffer.from(data);
}
|
||||
|
||||
/**
 * Convert an image to WebP in three sizes and write all of them into outputDir.
 *
 * Renditions (each fits inside the given square, never enlarged):
 *   full   - 1200x1200, quality 85 -> <baseFilename>.webp
 *   medium - 600x600,   quality 80 -> <baseFilename>-medium.webp
 *   thumb  - 200x200,   quality 75 -> <baseFilename>-thumb.webp
 *
 * @param buffer - source image bytes
 * @param outputDir - directory to write into (created if missing)
 * @param baseFilename - filename stem shared by the three files
 * @returns the three file paths (outputDir joined with each filename) and the
 *          combined size in bytes of the written files
 */
async function processAndSaveImage(
  buffer: Buffer,
  outputDir: string,
  baseFilename: string
): Promise<{ full: string; medium: string; thumb: string; totalBytes: number }> {
  await ensureDir(outputDir);

  // One entry per rendition, in full / medium / thumb order
  const renditions = [
    { suffix: '', edge: 1200, quality: 85 },
    { suffix: '-medium', edge: 600, quality: 80 },
    { suffix: '-thumb', edge: 200, quality: 75 },
  ];

  const targetPaths = renditions.map(({ suffix }) =>
    path.join(outputDir, `${baseFilename}${suffix}.webp`)
  );

  // Encode all renditions concurrently
  const encoded = await Promise.all(
    renditions.map(({ edge, quality }) =>
      sharp(buffer)
        .resize(edge, edge, { fit: 'inside', withoutEnlargement: true })
        .webp({ quality })
        .toBuffer()
    )
  );

  // Write all renditions concurrently
  await Promise.all(encoded.map((bytes, index) => fs.writeFile(targetPaths[index], bytes)));

  const [full, medium, thumb] = targetPaths;
  const totalBytes = encoded.reduce((sum, bytes) => sum + bytes.length, 0);

  return { full, medium, thumb, totalBytes };
}
|
||||
|
||||
/**
 * Convert a file path under IMAGES_BASE_PATH to a public URL under IMAGES_PUBLIC_URL.
 *
 * The relative part is computed with path.relative instead of a string replace, so the
 * result is the same whether or not IMAGES_BASE_PATH has a trailing separator or is a
 * relative path, and platform separators are converted to '/' for use in a URL.
 *
 * @param filePath - path of a stored image file
 * @returns IMAGES_PUBLIC_URL + '/' + the path relative to IMAGES_BASE_PATH
 */
function pathToUrl(filePath: string): string {
  const relativePath = path
    .relative(IMAGES_BASE_PATH, filePath)
    .split(path.sep)
    .join('/');

  // Drop trailing slashes from the base so the join never produces '//'
  const urlBase = IMAGES_PUBLIC_URL.replace(/\/+$/, '');

  return `${urlBase}/${relativePath}`;
}
|
||||
|
||||
/**
 * Download and store a product image locally
 *
 * Files are written to <IMAGES_BASE_PATH>/products/<dispensaryId>/ and named
 * <productId>-<urlHash>, where urlHash is derived from imageUrl. The product ID is
 * sanitized before it becomes part of the filename (same rule downloadBrandLogo
 * applies to brand IDs), so an ID containing path separators or other special
 * characters cannot escape the dispensary directory or produce an invalid filename.
 *
 * Never throws: every failure is reported through the returned result.
 *
 * @param imageUrl - The third-party image URL to download
 * @param dispensaryId - The dispensary ID (for directory organization)
 * @param productId - The product ID or external ID (for filename)
 * @returns Download result with local URLs on success, or an error message on failure
 */
export async function downloadProductImage(
  imageUrl: string,
  dispensaryId: number,
  productId: string | number
): Promise<DownloadResult> {
  try {
    if (!imageUrl) {
      return { success: false, error: 'No image URL provided' };
    }

    // Download the image
    const buffer = await downloadImage(imageUrl);

    // Organize by dispensary ID
    const outputDir = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId));

    // Sanitize product ID for filename; IDs made of letters, digits, '-' and '_'
    // are left unchanged.
    const safeProductId = String(productId).replace(/[^a-zA-Z0-9-_]/g, '_');

    // Use product ID + URL hash for uniqueness
    const urlHash = hashUrl(imageUrl);
    const baseFilename = `${safeProductId}-${urlHash}`;

    // Process and save
    const result = await processAndSaveImage(buffer, outputDir, baseFilename);

    return {
      success: true,
      urls: {
        full: pathToUrl(result.full),
        medium: pathToUrl(result.medium),
        thumb: pathToUrl(result.thumb),
      },
      bytesDownloaded: result.totalBytes,
    };
  } catch (error: any) {
    // Download, processing and write failures all end up here
    return {
      success: false,
      error: error.message || 'Failed to download image',
    };
  }
}
|
||||
|
||||
/**
 * Fetch a brand logo and keep a single local WebP copy of it.
 *
 * The logo is resized to fit within 400x400 (never enlarged), encoded as
 * WebP at quality 85 and written to `<IMAGES_BASE_PATH>/brands/`. The same
 * public URL is reported for the `full`, `medium` and `thumb` slots because
 * only one size is produced. This function never throws: failures are
 * reported through the returned result object.
 *
 * @param logoUrl - The brand logo URL
 * @param brandId - The brand ID or slug (sanitized before use in the filename)
 * @returns Download result with the local URL and the bytes written
 */
export async function downloadBrandLogo(
  logoUrl: string,
  brandId: string
): Promise<DownloadResult> {
  if (!logoUrl) {
    return { success: false, error: 'No logo URL provided' };
  }

  try {
    const sourceBytes = await downloadImage(logoUrl);

    const brandsDir = path.join(IMAGES_BASE_PATH, 'brands');

    // Anything outside letters, digits, '-' and '_' becomes '_' in the filename.
    const fileStem = `${brandId.replace(/[^a-zA-Z0-9-_]/g, '_')}-${hashUrl(logoUrl)}`;

    await ensureDir(brandsDir);
    const targetFile = path.join(brandsDir, `${fileStem}.webp`);

    const resizer = sharp(sourceBytes).resize(400, 400, { fit: 'inside', withoutEnlargement: true });
    const encoded = await resizer.webp({ quality: 85 }).toBuffer();

    await fs.writeFile(targetFile, encoded);

    // Logos exist in one size only, so all three slots share one URL.
    const publicUrl = pathToUrl(targetFile);
    return {
      success: true,
      urls: { full: publicUrl, medium: publicUrl, thumb: publicUrl },
      bytesDownloaded: encoded.length,
    };
  } catch (error: any) {
    const reason = error.message || 'Failed to download brand logo';
    return { success: false, error: reason };
  }
}
|
||||
|
||||
/**
 * Report whether the full-size local copy of a product image is present.
 *
 * Checks for `<IMAGES_BASE_PATH>/products/<dispensaryId>/<productId>-<hash>.webp`,
 * where the hash is derived from `imageUrl`.
 *
 * @param dispensaryId - The dispensary ID (sub-directory name)
 * @param productId - The product ID or external ID used in the filename
 * @param imageUrl - The source URL whose hash is part of the filename
 * @returns true when the file is accessible, false otherwise
 */
export async function imageExists(
  dispensaryId: number,
  productId: string | number,
  imageUrl: string
): Promise<boolean> {
  const fileName = `${productId}-${hashUrl(imageUrl)}.webp`;
  const candidate = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId), fileName);

  // fs.access rejects when the file cannot be reached; map that to false.
  return fs.access(candidate).then(
    () => true,
    () => false
  );
}
|
||||
|
||||
/**
 * Delete a product's local images
 *
 * Removes files in `<IMAGES_BASE_PATH>/products/<dispensaryId>/` whose name
 * starts with the product's filename prefix. This is best-effort: any
 * filesystem error (for example a missing directory) is swallowed.
 *
 * @param dispensaryId - The dispensary ID (sub-directory name)
 * @param productId - The product ID or external ID used in the filenames
 * @param imageUrl - When given, only files stored for this source URL are
 *                   removed; otherwise every image of the product is removed
 */
export async function deleteProductImages(
  dispensaryId: number,
  productId: string | number,
  imageUrl?: string
): Promise<void> {
  const productDir = path.join(IMAGES_BASE_PATH, 'products', String(dispensaryId));

  // Stored files are named `<productId>-<urlHash>...`. The '-' separator must
  // be part of the prefix: matching on the bare ID would make product "12"
  // also match (and delete) the files of products "123", "1245", etc.
  const prefix = imageUrl
    ? `${productId}-${hashUrl(imageUrl)}`
    : `${productId}-`;

  try {
    const files = await fs.readdir(productDir);
    const toDelete = files.filter(f => f.startsWith(prefix));
    await Promise.all(toDelete.map(f => fs.unlink(path.join(productDir, f))));
  } catch {
    // Directory might not exist, that's fine
  }
}
|
||||
|
||||
/**
 * Make sure the `products` and `brands` directories exist under
 * IMAGES_BASE_PATH, then log a confirmation line.
 */
export async function initializeImageStorage(): Promise<void> {
  // Created one after the other, products first.
  for (const section of ['products', 'brands']) {
    await ensureDir(path.join(IMAGES_BASE_PATH, section));
  }
  console.log(`✅ Image storage initialized at ${IMAGES_BASE_PATH}`);
}
|
||||
|
||||
/**
 * Get storage stats
 *
 * Counts stored images by scanning the storage directories. Missing or
 * unreadable directories are ignored and simply contribute zero.
 *
 * @returns The products/brands directory paths, the number of product images
 *          (one per stored image: size variants are not counted separately)
 *          and the number of brand logos
 */
export async function getStorageStats(): Promise<{
  productsDir: string;
  brandsDir: string;
  productCount: number;
  brandCount: number;
}> {
  const productsDir = path.join(IMAGES_BASE_PATH, 'products');
  const brandsDir = path.join(IMAGES_BASE_PATH, 'brands');

  let productCount = 0;
  let brandCount = 0;

  // Product files are named `<productId>-<urlHash>.webp`, so every one of them
  // contains '-': excluding names with a dash would always count zero.
  // Count the full-size file and skip only the size variants instead.
  // NOTE(review): assumes variants are suffixed `-medium.webp` / `-thumb.webp`;
  // confirm against processAndSaveImage.
  const isFullSizeImage = (file: string): boolean =>
    file.endsWith('.webp') &&
    !file.endsWith('-medium.webp') &&
    !file.endsWith('-thumb.webp');

  try {
    const productDirs = await fs.readdir(productsDir);
    for (const dir of productDirs) {
      try {
        const files = await fs.readdir(path.join(productsDir, dir));
        productCount += files.filter(isFullSizeImage).length;
      } catch {
        // Not a readable directory (e.g. a stray file): skip this entry
        // instead of abandoning the count for the remaining dispensaries.
      }
    }
  } catch { /* products directory missing or unreadable: count stays 0 */ }

  try {
    const brandFiles = await fs.readdir(brandsDir);
    brandCount = brandFiles.filter(f => f.endsWith('.webp')).length;
  } catch { /* brands directory missing or unreadable: count stays 0 */ }

  return {
    productsDir,
    brandsDir,
    productCount,
    brandCount,
  };
}
|
||||
206
backend/src/utils/product-normalizer.ts
Normal file
206
backend/src/utils/product-normalizer.ts
Normal file
@@ -0,0 +1,206 @@
|
||||
/**
|
||||
* Product Normalizer Utility
|
||||
*
|
||||
* Functions for normalizing product data to enable consistent matching
|
||||
* and prevent duplicate product entries.
|
||||
*/
|
||||
|
||||
/**
 * Normalize product name for matching
 * - Lowercase
 * - Remove punctuation (this also removes emoji and other symbols)
 * - Remove THC/CBD percentages often appended to names
 * - Remove weight suffixes
 * - Remove "special offer" markers
 * - Normalize whitespace
 *
 * @param name - Raw product name as scraped
 * @returns The normalized name, or '' when `name` is empty
 */
export function normalizeProductName(name: string): string {
  if (!name) return '';

  return name
    .toLowerCase()
    .trim()
    // First pass: blank out punctuation and symbols (emoji included, since they
    // are not word characters) but keep '.' and '%' for now. Stripping them
    // here would split decimals ("3.5g" -> "3 5g", "25.5%" -> "25 5") and the
    // suffix patterns below would then leave a stray number in the name.
    .replace(/[^\w\s.%]/g, ' ')
    // Remove cannabinoid percentages appended to names, e.g. "thc 25.5%".
    // Separators such as ':' and '=' were already turned into spaces above.
    .replace(/\s*(thc|cbd|cbg|cbn|tac)\s*\.?\d[\d.]*\s*%?/g, '')
    // Remove weight/size suffixes often appended, e.g. "3.5g", "100 mg"
    .replace(/\s*\d+(\.\d+)?\s*(mg|g|oz|ml|gram|grams|ounce|ounces)\b/g, '')
    // Second pass: blank out the '.' and '%' characters kept above
    .replace(/[^\w\s]/g, ' ')
    // Remove "special offer" markers; substitute a space so the words on
    // either side are not glued together
    .replace(/\s*special\s*offer\s*/g, ' ')
    // Normalize multiple spaces to single space
    .replace(/\s+/g, ' ')
    .trim();
}
|
||||
|
||||
/**
 * Reduce a brand name to a canonical form for comparisons: lowercase, with
 * every character that is neither a word character nor whitespace treated as
 * a separator, and words joined by single spaces.
 *
 * @param brand - Raw brand name; may be null or undefined
 * @returns The normalized brand, or '' when no brand was given
 */
export function normalizeBrandName(brand: string | null | undefined): string {
  if (!brand) {
    return '';
  }

  // Punctuation becomes whitespace, then the text is re-assembled word by
  // word so leading, trailing and repeated whitespace all disappear.
  const words = brand
    .toLowerCase()
    .replace(/[^\w\s]/g, ' ')
    .split(/\s+/)
    .filter(word => word.length > 0);

  return words.join(' ');
}
|
||||
|
||||
/**
 * Normalize weight string to standard format
 * e.g., "3.5 grams" -> "3.5g", "1/8 oz" -> "3.5g", "1 ounce" -> "28g"
 *
 * Ounces are converted to grams at 28 g per ounce. `mg` and `ml` values keep
 * their unit. A number without a recognized unit is treated as grams. Text
 * that contains no number at all is returned lowercased and trimmed.
 *
 * @param weight - Raw weight text; may be null or undefined
 * @returns The normalized weight, or '' when no weight was given
 */
export function normalizeWeight(weight: string | null | undefined): string {
  if (!weight) return '';

  const w = weight.toLowerCase().trim();

  // Handle fractional ounces
  if (w.includes('1/8') || w.includes('eighth')) {
    return '3.5g';
  }
  if (w.includes('1/4') || w.includes('quarter')) {
    return '7g';
  }
  if (w.includes('1/2') || w.includes('half')) {
    return '14g';
  }
  // A bare "oz" means one ounce. Numeric ounce values ("1 oz", "1oz",
  // "0.1 oz") go through the generic path below: a substring test for "1 oz"
  // would also fire on "0.1 oz" or "11 oz".
  if (w === 'oz') {
    return '28g';
  }

  // Extract numeric value and unit. The number must contain a digit, so a
  // lone "." can no longer be captured and rendered as "NaNg". Longer unit
  // spellings come before their one-letter prefix.
  const match = w.match(/(\d+(?:\.\d+)?|\.\d+)\s*(mg|ml|grams?|g|ounces?|oz)?/);
  if (!match) return w;

  const value = parseFloat(match[1]);
  const rawUnit = match[2] || 'g';

  // Normalize unit names
  let unit = rawUnit;
  if (rawUnit.startsWith('gram')) {
    unit = 'g';
  } else if (rawUnit.startsWith('ounce')) {
    unit = 'oz';
  }

  // Convert oz to grams for consistency. Round to one decimal, then re-parse
  // so whole numbers print without ".0" and "1 ounce" equals "1 oz" ("28g").
  if (unit === 'oz') {
    return `${parseFloat((value * 28).toFixed(1))}g`;
  }

  return `${value}${unit}`;
}
|
||||
|
||||
/**
 * Build the deduplication key for a product.
 *
 * The key is the normalized name, normalized brand, normalized weight and
 * category ID joined with '|'. Components that come out empty are left out
 * entirely, so they contribute neither text nor a separator.
 *
 * @param name - Raw product name
 * @param brand - Raw brand name, if any
 * @param weight - Raw weight text, if any
 * @param categoryId - Category ID, if any
 * @returns The fingerprint string
 */
export function generateProductFingerprint(
  name: string,
  brand: string | null | undefined,
  weight: string | null | undefined,
  categoryId: number | null | undefined
): string {
  const normalizedName = normalizeProductName(name);
  const normalizedBrand = normalizeBrandName(brand);
  const normalizedWeight = normalizeWeight(weight);
  const category = categoryId?.toString() || '';

  const fingerprint: string[] = [];
  for (const component of [normalizedName, normalizedBrand, normalizedWeight, category]) {
    if (component) {
      fingerprint.push(component);
    }
  }

  return fingerprint.join('|');
}
|
||||
|
||||
/**
 * Score how alike two strings are on a 0-100 scale.
 *
 * Identical strings (also when they differ only in letter case) score 100;
 * if exactly one of them is empty the score is 0. Otherwise the score is the
 * share of the longer string's length that is not covered by the Levenshtein
 * edit distance, rounded to the nearest integer.
 *
 * @param str1 - First string
 * @param str2 - Second string
 * @returns Similarity percentage
 */
export function stringSimilarity(str1: string, str2: string): number {
  if (str1 === str2) {
    return 100;
  }
  if (!str1 || !str2) {
    return 0;
  }

  const lowerA = str1.toLowerCase();
  const lowerB = str2.toLowerCase();
  if (lowerA === lowerB) {
    return 100;
  }

  // On equal lengths the second string is treated as the "longer" one.
  const [longer, shorter] = lowerA.length > lowerB.length
    ? [lowerA, lowerB]
    : [lowerB, lowerA];

  const longerLength = longer.length;
  if (longerLength === 0) {
    return 100;
  }

  const edits = levenshteinDistance(longer, shorter);
  return Math.round(((longerLength - edits) / longerLength) * 100);
}
|
||||
|
||||
/**
 * Compute the Levenshtein edit distance: the minimum number of
 * single-character insertions, deletions and substitutions that turn `str1`
 * into `str2`.
 *
 * @param str1 - Source string
 * @param str2 - Target string
 * @returns The edit distance
 */
function levenshteinDistance(str1: string, str2: string): number {
  const sourceLength = str1.length;
  const targetLength = str2.length;

  // Row 0 of the distance table: turning '' into the first j characters of
  // str2 costs j insertions.
  let previousRow: number[] = Array.from({ length: targetLength + 1 }, (_, j) => j);

  // Each row depends only on the row above it, so two rows are enough.
  for (let i = 1; i <= sourceLength; i++) {
    // Column 0: deleting the first i characters of str1.
    const currentRow: number[] = [i];

    for (let j = 1; j <= targetLength; j++) {
      const substitutionCost = str1[i - 1] === str2[j - 1] ? 0 : 1;
      const viaDeletion = previousRow[j] + 1;
      const viaInsertion = currentRow[j - 1] + 1;
      const viaSubstitution = previousRow[j - 1] + substitutionCost;
      currentRow.push(Math.min(viaDeletion, viaInsertion, viaSubstitution));
    }

    previousRow = currentRow;
  }

  return previousRow[targetLength];
}
|
||||
|
||||
/**
 * Check if two products are likely the same
 * Returns confidence score (0-100)
 *
 * Matching is tiered on the similarity of the normalized names:
 * 1. name similarity >= threshold: match.
 * 2. both brands present and equal after normalization, and name similarity
 *    >= threshold - 10: match, confidence is the name similarity + 5.
 * 3. both weights present and equal after normalization, and name similarity
 *    >= threshold - 15: match, confidence is the name similarity + 3.
 * Otherwise the products are reported as different, with the raw name
 * similarity as confidence.
 *
 * @param product1 - First product (name, optional brand and weight)
 * @param product2 - Second product (name, optional brand and weight)
 * @param threshold - Minimum name similarity (0-100) for a name-only match
 * @returns Whether the products match, and the confidence (never above 100)
 */
export function areProductsSimilar(
  product1: { name: string; brand?: string | null; weight?: string | null },
  product2: { name: string; brand?: string | null; weight?: string | null },
  threshold: number = 92
): { isSimilar: boolean; confidence: number } {
  // The brand/weight bonuses could push the score past 100 when a caller
  // passes a threshold above 95; keep the result inside the documented range.
  const capConfidence = (score: number): number => Math.min(100, score);

  const name1 = normalizeProductName(product1.name);
  const name2 = normalizeProductName(product2.name);

  const nameSimilarity = stringSimilarity(name1, name2);

  // If names are very similar, likely same product
  if (nameSimilarity >= threshold) {
    return { isSimilar: true, confidence: nameSimilarity };
  }

  // Check brand match for additional confidence
  const brand1 = normalizeBrandName(product1.brand);
  const brand2 = normalizeBrandName(product2.brand);

  if (brand1 && brand2 && brand1 === brand2) {
    // Same brand, lower threshold for name match
    if (nameSimilarity >= threshold - 10) {
      return { isSimilar: true, confidence: capConfidence(nameSimilarity + 5) };
    }
  }

  // Check weight match
  const weight1 = normalizeWeight(product1.weight);
  const weight2 = normalizeWeight(product2.weight);

  if (weight1 && weight2 && weight1 === weight2 && nameSimilarity >= threshold - 15) {
    return { isSimilar: true, confidence: capConfidence(nameSimilarity + 3) };
  }

  return { isSimilar: false, confidence: nameSimilarity };
}
|
||||
@@ -456,8 +456,12 @@ class ApiClient {
|
||||
}
|
||||
|
||||
// Dispensary Schedule (new dispensary-centric API)
|
||||
async getDispensarySchedules() {
|
||||
return this.request<{ dispensaries: any[] }>('/api/schedule/dispensaries');
|
||||
async getDispensarySchedules(filters?: { state?: string; search?: string }) {
|
||||
const params = new URLSearchParams();
|
||||
if (filters?.state) params.append('state', filters.state);
|
||||
if (filters?.search) params.append('search', filters.search);
|
||||
const queryString = params.toString();
|
||||
return this.request<{ dispensaries: any[] }>(`/api/schedule/dispensaries${queryString ? `?${queryString}` : ''}`);
|
||||
}
|
||||
|
||||
async getDispensarySchedule(dispensaryId: number) {
|
||||
@@ -482,6 +486,63 @@ class ApiClient {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * POST to the resolve-platform-id endpoint of the given dispensary.
 *
 * @param dispensaryId - ID of the dispensary
 * @returns The response of the request, typed as declared below
 */
async resolvePlatformId(dispensaryId: number) {
  const endpoint = `/api/schedule/dispensaries/${dispensaryId}/resolve-platform-id`;
  const options = { method: 'POST' };

  return this.request<{
    success: boolean;
    platform_dispensary_id?: string;
    slug_resolved?: string;
    message: string;
    already_resolved?: boolean;
    error?: string;
  }>(endpoint, options);
}
|
||||
|
||||
/**
 * POST to the detect-menu-type endpoint of the given dispensary.
 *
 * @param dispensaryId - ID of the dispensary
 * @returns The response of the request, typed as declared below
 */
async detectMenuType(dispensaryId: number) {
  const endpoint = `/api/schedule/dispensaries/${dispensaryId}/detect-menu-type`;
  const options = { method: 'POST' };

  return this.request<{
    success: boolean;
    menu_type: string;
    url_checked: string;
    message: string;
  }>(endpoint, options);
}
|
||||
|
||||
/**
 * POST to the refresh-detection endpoint of the given dispensary.
 *
 * @param dispensaryId - ID of the dispensary
 * @returns The response of the request, typed as declared below
 */
async refreshDetection(dispensaryId: number) {
  const endpoint = `/api/schedule/dispensaries/${dispensaryId}/refresh-detection`;
  const options = { method: 'POST' };

  return this.request<{
    success: boolean;
    menu_type: string;
    platform_dispensary_id: string | null;
    url_checked: string;
    can_crawl: boolean;
  }>(endpoint, options);
}
|
||||
|
||||
/**
 * PUT the desired active flag to the toggle-active endpoint of the given
 * dispensary. The flag is sent as `is_active` in a JSON body.
 *
 * @param dispensaryId - ID of the dispensary
 * @param isActive - Value to send as `is_active`
 * @returns The response of the request, typed as declared below
 */
async toggleDispensarySchedule(dispensaryId: number, isActive: boolean) {
  const endpoint = `/api/schedule/dispensaries/${dispensaryId}/toggle-active`;
  const payload = JSON.stringify({ is_active: isActive });

  return this.request<{
    success: boolean;
    schedule: any;
    message: string;
  }>(endpoint, { method: 'PUT', body: payload });
}
|
||||
|
||||
/**
 * Send a DELETE request to the schedule endpoint of the given dispensary.
 *
 * @param dispensaryId - ID of the dispensary
 * @returns The response of the request, typed as declared below
 */
async deleteDispensarySchedule(dispensaryId: number) {
  const endpoint = `/api/schedule/dispensaries/${dispensaryId}/schedule`;
  const options = { method: 'DELETE' };

  return this.request<{
    success: boolean;
    deleted: boolean;
    message: string;
  }>(endpoint, options);
}
|
||||
|
||||
async getCrawlJobs(limit?: number) {
|
||||
const params = limit ? `?limit=${limit}` : '';
|
||||
return this.request<{ jobs: any[] }>(`/api/schedule/jobs${params}`);
|
||||
|
||||
@@ -18,21 +18,27 @@ interface DispensarySchedule {
|
||||
dispensary_name: string;
|
||||
city: string | null;
|
||||
state: string | null;
|
||||
dispensary_slug: string | null;
|
||||
slug: string | null;
|
||||
website: string | null;
|
||||
menu_url: string | null;
|
||||
menu_type: string | null;
|
||||
platform_dispensary_id: string | null;
|
||||
product_provider: string | null;
|
||||
provider_type: string | null;
|
||||
product_confidence: number | null;
|
||||
product_crawler_mode: string | null;
|
||||
last_product_scan_at: string | null;
|
||||
is_active: boolean;
|
||||
schedule_active: boolean;
|
||||
interval_minutes: number;
|
||||
interval_minutes: number | null;
|
||||
priority: number;
|
||||
last_run_at: string | null;
|
||||
next_run_at: string | null;
|
||||
schedule_last_status: string | null;
|
||||
last_status: string | null;
|
||||
last_summary: string | null;
|
||||
schedule_last_error: string | null;
|
||||
last_error: string | null;
|
||||
consecutive_failures: number | null;
|
||||
total_runs: number | null;
|
||||
@@ -42,6 +48,9 @@ interface DispensarySchedule {
|
||||
latest_job_status: string | null;
|
||||
latest_job_started: string | null;
|
||||
latest_products_found: number | null;
|
||||
// Computed from view
|
||||
can_crawl: boolean;
|
||||
schedule_status_reason: string | null;
|
||||
}
|
||||
|
||||
interface CrawlJob {
|
||||
@@ -69,6 +78,21 @@ export function ScraperSchedule() {
|
||||
const [autoRefresh, setAutoRefresh] = useState(true);
|
||||
const [activeTab, setActiveTab] = useState<'dispensaries' | 'jobs' | 'global'>('dispensaries');
|
||||
const [triggeringDispensary, setTriggeringDispensary] = useState<number | null>(null);
|
||||
const [resolvingId, setResolvingId] = useState<number | null>(null);
|
||||
const [refreshingDetection, setRefreshingDetection] = useState<number | null>(null);
|
||||
const [togglingSchedule, setTogglingSchedule] = useState<number | null>(null);
|
||||
const [filterDutchieOnly, setFilterDutchieOnly] = useState(false);
|
||||
const [stateFilter, setStateFilter] = useState<'all' | 'AZ'>('all');
|
||||
const [searchTerm, setSearchTerm] = useState('');
|
||||
const [searchInput, setSearchInput] = useState(''); // For debouncing
|
||||
|
||||
// Debounce search input
|
||||
useEffect(() => {
|
||||
const timer = setTimeout(() => {
|
||||
setSearchTerm(searchInput);
|
||||
}, 300);
|
||||
return () => clearTimeout(timer);
|
||||
}, [searchInput]);
|
||||
|
||||
useEffect(() => {
|
||||
loadData();
|
||||
@@ -77,13 +101,22 @@ export function ScraperSchedule() {
|
||||
const interval = setInterval(loadData, 5000);
|
||||
return () => clearInterval(interval);
|
||||
}
|
||||
}, [autoRefresh]);
|
||||
}, [autoRefresh, stateFilter, searchTerm]);
|
||||
|
||||
const loadData = async () => {
|
||||
try {
|
||||
// Build filters for dispensary schedules
|
||||
const filters: { state?: string; search?: string } = {};
|
||||
if (stateFilter === 'AZ') {
|
||||
filters.state = 'AZ';
|
||||
}
|
||||
if (searchTerm.trim()) {
|
||||
filters.search = searchTerm.trim();
|
||||
}
|
||||
|
||||
const [globalData, dispensaryData, jobsData] = await Promise.all([
|
||||
api.getGlobalSchedule(),
|
||||
api.getDispensarySchedules(),
|
||||
api.getDispensarySchedules(Object.keys(filters).length > 0 ? filters : undefined),
|
||||
api.getDispensaryCrawlJobs(100)
|
||||
]);
|
||||
|
||||
@@ -129,6 +162,62 @@ export function ScraperSchedule() {
|
||||
}
|
||||
};
|
||||
|
||||
// Ask the API to resolve the platform ID for one dispensary, show the outcome
// in an alert and reload the data. `resolvingId` holds the dispensary ID
// while the request is in flight and is always cleared afterwards.
const handleResolvePlatformId = async (dispensaryId: number) => {
  setResolvingId(dispensaryId);
  try {
    const outcome = await api.resolvePlatformId(dispensaryId);
    const notice = outcome.success
      ? outcome.message
      : `Failed: ${outcome.error || outcome.message}`;
    alert(notice);
    await loadData();
  } catch (error: any) {
    console.error('Failed to resolve platform ID:', error);
    alert(`Error: ${error.message}`);
  } finally {
    setResolvingId(null);
  }
};
|
||||
|
||||
// Re-run detection for one dispensary, report the detected menu type (and the
// platform ID when one came back) in an alert, then reload the data.
// `refreshingDetection` holds the dispensary ID while the request is in
// flight and is always cleared afterwards.
const handleRefreshDetection = async (dispensaryId: number) => {
  setRefreshingDetection(dispensaryId);
  try {
    const detection = await api.refreshDetection(dispensaryId);
    const platformPart = detection.platform_dispensary_id
      ? `, Platform ID: ${detection.platform_dispensary_id}`
      : '';
    alert(`Detected: ${detection.menu_type}${platformPart}`);
    await loadData();
  } catch (error: any) {
    console.error('Failed to refresh detection:', error);
    alert(`Error: ${error.message}`);
  } finally {
    setRefreshingDetection(null);
  }
};
|
||||
|
||||
// Flip the active flag of one dispensary's schedule and reload the data.
// `togglingSchedule` holds the dispensary ID while the request is in flight
// and is always cleared afterwards.
const handleToggleSchedule = async (dispensaryId: number, currentActive: boolean) => {
  const nextActive = !currentActive;
  setTogglingSchedule(dispensaryId);
  try {
    await api.toggleDispensarySchedule(dispensaryId, nextActive);
    await loadData();
  } catch (error: any) {
    console.error('Failed to toggle schedule:', error);
    alert(`Error: ${error.message}`);
  } finally {
    setTogglingSchedule(null);
  }
};
|
||||
|
||||
// Delete one dispensary's schedule after the user confirms, then reload the
// data. Declining the confirmation does nothing.
const handleDeleteSchedule = async (dispensaryId: number) => {
  const confirmed = confirm('Are you sure you want to delete this schedule?');
  if (!confirmed) {
    return;
  }

  try {
    await api.deleteDispensarySchedule(dispensaryId);
    await loadData();
  } catch (error: any) {
    console.error('Failed to delete schedule:', error);
    alert(`Error: ${error.message}`);
  }
};
|
||||
|
||||
const handleUpdateGlobalSchedule = async (type: string, data: any) => {
|
||||
try {
|
||||
await api.updateGlobalSchedule(type, data);
|
||||
@@ -373,32 +462,127 @@ export function ScraperSchedule() {
|
||||
)}
|
||||
|
||||
{activeTab === 'dispensaries' && (
|
||||
<div>
|
||||
{/* Filter Bar */}
|
||||
<div style={{ marginBottom: '15px', display: 'flex', gap: '20px', alignItems: 'center', flexWrap: 'wrap' }}>
|
||||
{/* State Filter Toggle */}
|
||||
<div style={{ display: 'flex', alignItems: 'center', gap: '8px' }}>
|
||||
<span style={{ fontWeight: '500', color: '#374151' }}>State:</span>
|
||||
<div style={{ display: 'flex', borderRadius: '6px', overflow: 'hidden', border: '1px solid #d1d5db' }}>
|
||||
<button
|
||||
onClick={() => setStateFilter('all')}
|
||||
style={{
|
||||
padding: '6px 14px',
|
||||
background: stateFilter === 'all' ? '#2563eb' : 'white',
|
||||
color: stateFilter === 'all' ? 'white' : '#374151',
|
||||
border: 'none',
|
||||
cursor: 'pointer',
|
||||
fontSize: '14px',
|
||||
fontWeight: '500'
|
||||
}}
|
||||
>
|
||||
All
|
||||
</button>
|
||||
<button
|
||||
onClick={() => setStateFilter('AZ')}
|
||||
style={{
|
||||
padding: '6px 14px',
|
||||
background: stateFilter === 'AZ' ? '#2563eb' : 'white',
|
||||
color: stateFilter === 'AZ' ? 'white' : '#374151',
|
||||
border: 'none',
|
||||
borderLeft: '1px solid #d1d5db',
|
||||
cursor: 'pointer',
|
||||
fontSize: '14px',
|
||||
fontWeight: '500'
|
||||
}}
|
||||
>
|
||||
AZ Only
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Search Box */}
|
||||
<div style={{ display: 'flex', alignItems: 'center', gap: '8px' }}>
|
||||
<span style={{ fontWeight: '500', color: '#374151' }}>Search:</span>
|
||||
<input
|
||||
type="text"
|
||||
placeholder="Store name or slug..."
|
||||
value={searchInput}
|
||||
onChange={(e) => setSearchInput(e.target.value)}
|
||||
style={{
|
||||
padding: '6px 12px',
|
||||
borderRadius: '6px',
|
||||
border: '1px solid #d1d5db',
|
||||
fontSize: '14px',
|
||||
width: '200px'
|
||||
}}
|
||||
/>
|
||||
{searchInput && (
|
||||
<button
|
||||
onClick={() => { setSearchInput(''); setSearchTerm(''); }}
|
||||
style={{
|
||||
padding: '4px 8px',
|
||||
background: '#f3f4f6',
|
||||
border: '1px solid #d1d5db',
|
||||
borderRadius: '4px',
|
||||
cursor: 'pointer',
|
||||
fontSize: '12px'
|
||||
}}
|
||||
>
|
||||
Clear
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
|
||||
{/* Dutchie Only Checkbox */}
|
||||
<label style={{ display: 'flex', alignItems: 'center', gap: '8px', cursor: 'pointer' }}>
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={filterDutchieOnly}
|
||||
onChange={(e) => setFilterDutchieOnly(e.target.checked)}
|
||||
style={{ width: '16px', height: '16px', cursor: 'pointer' }}
|
||||
/>
|
||||
<span>Dutchie only</span>
|
||||
</label>
|
||||
|
||||
{/* Results Count */}
|
||||
<span style={{ color: '#666', fontSize: '14px', marginLeft: 'auto' }}>
|
||||
Showing {(filterDutchieOnly
|
||||
? dispensarySchedules.filter(d => d.menu_type === 'dutchie')
|
||||
: dispensarySchedules
|
||||
).length} dispensaries
|
||||
</span>
|
||||
</div>
|
||||
<div style={{
|
||||
background: 'white',
|
||||
borderRadius: '8px',
|
||||
boxShadow: '0 2px 8px rgba(0,0,0,0.1)',
|
||||
overflow: 'hidden'
|
||||
overflow: 'auto'
|
||||
}}>
|
||||
<table style={{ width: '100%', borderCollapse: 'collapse' }}>
|
||||
<table style={{ width: '100%', borderCollapse: 'collapse', minWidth: '1200px' }}>
|
||||
<thead>
|
||||
<tr style={{ background: '#f8f8f8', borderBottom: '2px solid #eee' }}>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Dispensary</th>
|
||||
<th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Provider</th>
|
||||
<th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Schedule</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Last Run</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Next Run</th>
|
||||
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Last Result</th>
|
||||
<th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Actions</th>
|
||||
<th style={{ padding: '12px', textAlign: 'left', fontWeight: '600' }}>Dispensary</th>
|
||||
<th style={{ padding: '12px', textAlign: 'center', fontWeight: '600' }}>Menu Type</th>
|
||||
<th style={{ padding: '12px', textAlign: 'center', fontWeight: '600' }}>Platform ID</th>
|
||||
<th style={{ padding: '12px', textAlign: 'center', fontWeight: '600' }}>Status</th>
|
||||
<th style={{ padding: '12px', textAlign: 'left', fontWeight: '600' }}>Last Run</th>
|
||||
<th style={{ padding: '12px', textAlign: 'left', fontWeight: '600' }}>Next Run</th>
|
||||
<th style={{ padding: '12px', textAlign: 'left', fontWeight: '600' }}>Last Result</th>
|
||||
<th style={{ padding: '12px', textAlign: 'center', fontWeight: '600', minWidth: '220px' }}>Actions</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{dispensarySchedules.map((disp) => (
|
||||
{(filterDutchieOnly
|
||||
? dispensarySchedules.filter(d => d.menu_type === 'dutchie')
|
||||
: dispensarySchedules
|
||||
).map((disp) => (
|
||||
<tr key={disp.dispensary_id} style={{ borderBottom: '1px solid #eee' }}>
|
||||
<td style={{ padding: '15px' }}>
|
||||
<td style={{ padding: '12px' }}>
|
||||
<div style={{ display: 'flex', alignItems: 'center', gap: '8px' }}>
|
||||
{disp.state && disp.city && disp.slug ? (
|
||||
{disp.state && disp.city && (disp.dispensary_slug || disp.slug) ? (
|
||||
<Link
|
||||
to={`/dispensaries/${disp.state}/${disp.city.toLowerCase().replace(/\s+/g, '-')}/${disp.slug}`}
|
||||
to={`/dispensaries/${disp.state}/${disp.city.toLowerCase().replace(/\s+/g, '-')}/${disp.dispensary_slug || disp.slug}`}
|
||||
style={{
|
||||
fontWeight: '600',
|
||||
color: '#2563eb',
|
||||
@@ -411,66 +595,86 @@ export function ScraperSchedule() {
|
||||
<span style={{ fontWeight: '600' }}>{disp.dispensary_name}</span>
|
||||
)}
|
||||
</div>
|
||||
<div style={{ fontSize: '13px', color: '#666' }}>
|
||||
<div style={{ fontSize: '12px', color: '#666' }}>
|
||||
{disp.city ? `${disp.city}, ${disp.state}` : disp.state}
|
||||
</div>
|
||||
</td>
|
||||
<td style={{ padding: '15px', textAlign: 'center' }}>
|
||||
{(disp.product_provider || disp.provider_type) && disp.product_provider !== 'unknown' && disp.provider_type !== 'unknown' ? (
|
||||
<div>
|
||||
{/* Menu Type Column */}
|
||||
<td style={{ padding: '12px', textAlign: 'center' }}>
|
||||
{disp.menu_type ? (
|
||||
<span style={{
|
||||
padding: '4px 10px',
|
||||
borderRadius: '12px',
|
||||
fontSize: '12px',
|
||||
fontSize: '11px',
|
||||
fontWeight: '600',
|
||||
background: disp.product_crawler_mode === 'production' ? '#d1fae5' : '#fef3c7',
|
||||
color: disp.product_crawler_mode === 'production' ? '#065f46' : '#92400e'
|
||||
background: disp.menu_type === 'dutchie' ? '#d1fae5' : '#e0e7ff',
|
||||
color: disp.menu_type === 'dutchie' ? '#065f46' : '#3730a3'
|
||||
}}>
|
||||
{disp.product_provider || disp.provider_type}
|
||||
</span>
|
||||
{disp.product_crawler_mode !== 'production' && (
|
||||
<div style={{ fontSize: '10px', color: '#92400e', marginTop: '2px' }}>sandbox</div>
|
||||
)}
|
||||
</div>
|
||||
) : disp.menu_url ? (
|
||||
<span style={{
|
||||
padding: '4px 10px',
|
||||
borderRadius: '12px',
|
||||
fontSize: '12px',
|
||||
fontWeight: '600',
|
||||
background: '#dbeafe',
|
||||
color: '#1e40af'
|
||||
}}>
|
||||
Pending
|
||||
{disp.menu_type}
|
||||
</span>
|
||||
) : (
|
||||
<span style={{
|
||||
padding: '4px 10px',
|
||||
borderRadius: '12px',
|
||||
fontSize: '12px',
|
||||
fontSize: '11px',
|
||||
fontWeight: '600',
|
||||
background: '#f3f4f6',
|
||||
color: '#666'
|
||||
}}>
|
||||
-
|
||||
unknown
|
||||
</span>
|
||||
)}
|
||||
</td>
|
||||
<td style={{ padding: '15px', textAlign: 'center' }}>
|
||||
{/* Platform ID Column */}
|
||||
<td style={{ padding: '12px', textAlign: 'center' }}>
|
||||
{disp.platform_dispensary_id ? (
|
||||
<span style={{
|
||||
padding: '4px 8px',
|
||||
borderRadius: '4px',
|
||||
fontSize: '10px',
|
||||
fontFamily: 'monospace',
|
||||
background: '#d1fae5',
|
||||
color: '#065f46'
|
||||
}} title={disp.platform_dispensary_id}>
|
||||
{disp.platform_dispensary_id.length > 12
|
||||
? `${disp.platform_dispensary_id.slice(0, 6)}...${disp.platform_dispensary_id.slice(-4)}`
|
||||
: disp.platform_dispensary_id}
|
||||
</span>
|
||||
) : (
|
||||
<span style={{
|
||||
padding: '4px 8px',
|
||||
borderRadius: '4px',
|
||||
fontSize: '10px',
|
||||
background: '#fee2e2',
|
||||
color: '#991b1b'
|
||||
}}>
|
||||
missing
|
||||
</span>
|
||||
)}
|
||||
</td>
|
||||
{/* Status Column - Shows can_crawl and reason */}
|
||||
<td style={{ padding: '12px', textAlign: 'center' }}>
|
||||
<div style={{ display: 'flex', flexDirection: 'column', alignItems: 'center', gap: '4px' }}>
|
||||
<span style={{
|
||||
padding: '4px 10px',
|
||||
borderRadius: '12px',
|
||||
fontSize: '12px',
|
||||
fontSize: '11px',
|
||||
fontWeight: '600',
|
||||
background: disp.schedule_active ? '#d1fae5' : '#fee2e2',
|
||||
color: disp.schedule_active ? '#065f46' : '#991b1b'
|
||||
background: disp.can_crawl ? '#d1fae5' : (disp.is_active !== false ? '#fef3c7' : '#fee2e2'),
|
||||
color: disp.can_crawl ? '#065f46' : (disp.is_active !== false ? '#92400e' : '#991b1b')
|
||||
}}>
|
||||
{disp.schedule_active ? 'Active' : 'Disabled'}
|
||||
{disp.can_crawl ? 'Ready' : (disp.is_active !== false ? 'Not Ready' : 'Disabled')}
|
||||
</span>
|
||||
<span style={{ fontSize: '12px', color: '#666' }}>
|
||||
{disp.schedule_status_reason && disp.schedule_status_reason !== 'ready' && (
|
||||
<span style={{ fontSize: '10px', color: '#666', maxWidth: '100px', textAlign: 'center' }}>
|
||||
{disp.schedule_status_reason}
|
||||
</span>
|
||||
)}
|
||||
{disp.interval_minutes && (
|
||||
<span style={{ fontSize: '10px', color: '#999' }}>
|
||||
Every {Math.round(disp.interval_minutes / 60)}h
|
||||
</span>
|
||||
)}
|
||||
</div>
|
||||
</td>
|
||||
<td style={{ padding: '15px' }}>
|
||||
@@ -530,28 +734,91 @@ export function ScraperSchedule() {
|
||||
<span style={{ color: '#999', fontSize: '13px' }}>No runs yet</span>
|
||||
)}
|
||||
</td>
|
||||
<td style={{ padding: '15px', textAlign: 'center' }}>
|
||||
<td style={{ padding: '12px', textAlign: 'center' }}>
|
||||
<div style={{ display: 'flex', gap: '6px', justifyContent: 'center', flexWrap: 'wrap' }}>
|
||||
{/* Refresh Detection - always available */}
|
||||
<button
|
||||
onClick={() => handleRefreshDetection(disp.dispensary_id)}
|
||||
disabled={refreshingDetection === disp.dispensary_id}
|
||||
style={{
|
||||
padding: '4px 8px',
|
||||
background: refreshingDetection === disp.dispensary_id ? '#94a3b8' : '#f3f4f6',
|
||||
color: '#374151',
|
||||
border: '1px solid #d1d5db',
|
||||
borderRadius: '4px',
|
||||
cursor: refreshingDetection === disp.dispensary_id ? 'wait' : 'pointer',
|
||||
fontSize: '11px'
|
||||
}}
|
||||
title="Re-detect menu type and resolve platform ID"
|
||||
>
|
||||
{refreshingDetection === disp.dispensary_id ? '...' : 'Refresh'}
|
||||
</button>
|
||||
|
||||
{/* Resolve ID - only if dutchie and missing platform ID */}
|
||||
{disp.menu_type === 'dutchie' && !disp.platform_dispensary_id && (
|
||||
<button
|
||||
onClick={() => handleResolvePlatformId(disp.dispensary_id)}
|
||||
disabled={resolvingId === disp.dispensary_id}
|
||||
style={{
|
||||
padding: '4px 8px',
|
||||
background: resolvingId === disp.dispensary_id ? '#94a3b8' : '#fef3c7',
|
||||
color: '#92400e',
|
||||
border: '1px solid #fcd34d',
|
||||
borderRadius: '4px',
|
||||
cursor: resolvingId === disp.dispensary_id ? 'wait' : 'pointer',
|
||||
fontSize: '11px'
|
||||
}}
|
||||
title="Resolve platform dispensary ID via GraphQL"
|
||||
>
|
||||
{resolvingId === disp.dispensary_id ? '...' : 'Resolve ID'}
|
||||
</button>
|
||||
)}
|
||||
|
||||
{/* Run Now - only if can_crawl */}
|
||||
<button
|
||||
onClick={() => handleTriggerCrawl(disp.dispensary_id)}
|
||||
disabled={triggeringDispensary === disp.dispensary_id}
|
||||
disabled={triggeringDispensary === disp.dispensary_id || !disp.can_crawl}
|
||||
style={{
|
||||
padding: '6px 12px',
|
||||
background: triggeringDispensary === disp.dispensary_id ? '#94a3b8' : '#2563eb',
|
||||
color: 'white',
|
||||
padding: '4px 8px',
|
||||
background: triggeringDispensary === disp.dispensary_id ? '#94a3b8' :
|
||||
!disp.can_crawl ? '#e5e7eb' : '#2563eb',
|
||||
color: !disp.can_crawl ? '#9ca3af' : 'white',
|
||||
border: 'none',
|
||||
borderRadius: '4px',
|
||||
cursor: triggeringDispensary === disp.dispensary_id ? 'wait' : 'pointer',
|
||||
fontSize: '13px'
|
||||
cursor: triggeringDispensary === disp.dispensary_id || !disp.can_crawl ? 'not-allowed' : 'pointer',
|
||||
fontSize: '11px'
|
||||
}}
|
||||
title={disp.can_crawl ? 'Trigger immediate crawl' : `Cannot crawl: ${disp.schedule_status_reason}`}
|
||||
>
|
||||
{triggeringDispensary === disp.dispensary_id ? 'Starting...' : 'Run Now'}
|
||||
{triggeringDispensary === disp.dispensary_id ? '...' : 'Run'}
|
||||
</button>
|
||||
|
||||
{/* Enable/Disable Schedule Toggle */}
|
||||
<button
|
||||
onClick={() => handleToggleSchedule(disp.dispensary_id, disp.is_active)}
|
||||
disabled={togglingSchedule === disp.dispensary_id}
|
||||
style={{
|
||||
padding: '4px 8px',
|
||||
background: togglingSchedule === disp.dispensary_id ? '#94a3b8' :
|
||||
disp.is_active ? '#fee2e2' : '#d1fae5',
|
||||
color: disp.is_active ? '#991b1b' : '#065f46',
|
||||
border: 'none',
|
||||
borderRadius: '4px',
|
||||
cursor: togglingSchedule === disp.dispensary_id ? 'wait' : 'pointer',
|
||||
fontSize: '11px'
|
||||
}}
|
||||
title={disp.is_active ? 'Disable scheduled crawling' : 'Enable scheduled crawling'}
|
||||
>
|
||||
{togglingSchedule === disp.dispensary_id ? '...' : (disp.is_active ? 'Disable' : 'Enable')}
|
||||
</button>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
))}
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{activeTab === 'jobs' && (
|
||||
|
||||
Reference in New Issue
Block a user