From 199b6a8a237531e56ece6ebd1e8e3b8703449aa1 Mon Sep 17 00:00:00 2001 From: Kelly Date: Mon, 1 Dec 2025 08:52:54 -0700 Subject: [PATCH] Remove incorrect migration 029, add snapshot architecture, improve scraper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Delete migration 029 that was incorrectly creating duplicate dispensaries - Add migration 028 for snapshot architecture - Improve downloader with proxy/UA rotation - Update scraper monitor and tools pages - Various scraper improvements 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../migrations/028_snapshot_architecture.sql | 240 +++++++++++++++++ ...29_link_dutchie_stores_to_dispensaries.sql | 56 ---- backend/src/routes/scraper-monitor.ts | 117 +++++---- backend/src/scraper-v2/downloader.ts | 245 ++++++++++++++++-- backend/src/scraper-v2/middlewares.ts | 148 ++++++++--- backend/src/scraper-v2/pipelines.ts | 42 ++- backend/src/scripts/queue-dispensaries.ts | 2 +- backend/src/scripts/queue-intelligence.ts | 2 +- backend/src/services/crawler-jobs.ts | 2 +- backend/src/utils/minio.ts | 17 +- frontend/src/pages/ScraperMonitor.tsx | 43 ++- frontend/src/pages/ScraperTools.tsx | 187 +++---------- 12 files changed, 760 insertions(+), 341 deletions(-) create mode 100644 backend/migrations/028_snapshot_architecture.sql delete mode 100644 backend/migrations/029_link_dutchie_stores_to_dispensaries.sql diff --git a/backend/migrations/028_snapshot_architecture.sql b/backend/migrations/028_snapshot_architecture.sql new file mode 100644 index 00000000..54a8a78d --- /dev/null +++ b/backend/migrations/028_snapshot_architecture.sql @@ -0,0 +1,240 @@ +-- Migration 028: Snapshot Architecture +-- Implements append-only snapshots for full history tracking +-- Following the principle: "Never delete, only append observations" + +-- ===================================================== +-- LAYER 1: Raw Append-Only Snapshots (NEVER DELETE) +-- ===================================================== + +-- Product snapshots - one row per product per crawl +CREATE TABLE IF NOT EXISTS product_snapshots ( + id SERIAL PRIMARY KEY, + + -- Source identification + crawl_id UUID NOT NULL, -- Groups all products from same crawl run + dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id), + + -- Product identification + external_product_id VARCHAR(255), -- Dutchie/provider product ID + product_slug VARCHAR(500), -- URL slug for matching + + -- Product details (as seen at crawl time) + name VARCHAR(500) NOT NULL, + brand VARCHAR(255), + category VARCHAR(100), + subcategory VARCHAR(100), + + -- Pricing snapshot + price NUMERIC(10,2), + original_price NUMERIC(10,2), + sale_price NUMERIC(10,2), + discount_type VARCHAR(50), + discount_value VARCHAR(100), + + -- Availability snapshot + availability_status VARCHAR(30) NOT NULL DEFAULT 'unknown', + -- 'in_stock', 'out_of_stock', 'limited', 'removed_from_menu', 'unknown' + stock_quantity INTEGER, + + -- Potency snapshot + thc_percentage NUMERIC(5,2), + cbd_percentage NUMERIC(5,2), + + -- Product attributes + strain_type VARCHAR(100), + weight VARCHAR(100), + variant VARCHAR(255), + + -- Rich data + description TEXT, + image_url TEXT, + effects TEXT[], + terpenes TEXT[], + + -- Timestamp + captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + + -- Raw metadata from provider + raw_data JSONB +); + +-- Indexes for efficient querying +CREATE INDEX IF NOT EXISTS idx_snapshots_dispensary_time ON product_snapshots(dispensary_id, captured_at DESC); 
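+-- Illustrative usage (example values, not part of this migration): each crawl
+-- run appends one row per product and never updates earlier rows, e.g.
+--   INSERT INTO product_snapshots
+--     (crawl_id, dispensary_id, name, brand, price, availability_status)
+--   VALUES ('11111111-1111-1111-1111-111111111111', 42,
+--           'Example Flower 3.5g', 'Example Brand', 25.00, 'in_stock');
+-- The latest state per product is then recovered via captured_at
+-- (see the current_products view below).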
+CREATE INDEX IF NOT EXISTS idx_snapshots_crawl ON product_snapshots(crawl_id); +CREATE INDEX IF NOT EXISTS idx_snapshots_brand ON product_snapshots(brand, captured_at DESC); +CREATE INDEX IF NOT EXISTS idx_snapshots_product_slug ON product_snapshots(product_slug, captured_at DESC); +CREATE INDEX IF NOT EXISTS idx_snapshots_external_id ON product_snapshots(external_product_id, captured_at DESC); +CREATE INDEX IF NOT EXISTS idx_snapshots_availability ON product_snapshots(availability_status, captured_at DESC); +CREATE INDEX IF NOT EXISTS idx_snapshots_category ON product_snapshots(category, captured_at DESC); + +-- Brand snapshots - summary of brands seen per crawl +CREATE TABLE IF NOT EXISTS brand_snapshots ( + id SERIAL PRIMARY KEY, + crawl_id UUID NOT NULL, + dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id), + + brand_name VARCHAR(255) NOT NULL, + product_count INTEGER NOT NULL DEFAULT 0, + in_stock_count INTEGER NOT NULL DEFAULT 0, + + -- Price range for this brand at this store at this time + min_price NUMERIC(10,2), + max_price NUMERIC(10,2), + avg_price NUMERIC(10,2), + + -- Categories this brand has products in + categories TEXT[], + + captured_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_brand_snapshots_dispensary ON brand_snapshots(dispensary_id, captured_at DESC); +CREATE INDEX IF NOT EXISTS idx_brand_snapshots_brand ON brand_snapshots(brand_name, captured_at DESC); +CREATE INDEX IF NOT EXISTS idx_brand_snapshots_crawl ON brand_snapshots(crawl_id); + +-- Crawl runs table - metadata about each crawl +CREATE TABLE IF NOT EXISTS crawl_runs ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id), + + started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + completed_at TIMESTAMPTZ, + + status VARCHAR(20) NOT NULL DEFAULT 'running', -- 'running', 'completed', 'failed' + + -- Results + products_found INTEGER DEFAULT 0, + brands_found INTEGER DEFAULT 0, + categories_found INTEGER DEFAULT 0, + + -- Errors if any + error_message TEXT, + + -- Provider info + provider VARCHAR(50), + menu_url TEXT +); + +CREATE INDEX IF NOT EXISTS idx_crawl_runs_dispensary ON crawl_runs(dispensary_id, started_at DESC); +CREATE INDEX IF NOT EXISTS idx_crawl_runs_status ON crawl_runs(status); + +-- ===================================================== +-- LAYER 2: Summary/Rollup Tables (can be recalculated) +-- ===================================================== + +-- Daily brand summary per store +CREATE TABLE IF NOT EXISTS brand_store_day_summary ( + id SERIAL PRIMARY KEY, + dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id), + brand_name VARCHAR(255) NOT NULL, + summary_date DATE NOT NULL, + + -- Presence + first_seen_at TIMESTAMPTZ, + last_seen_at TIMESTAMPTZ, + crawl_count INTEGER DEFAULT 0, -- how many times we saw this brand today + + -- Product counts + total_skus INTEGER DEFAULT 0, + in_stock_skus INTEGER DEFAULT 0, + out_of_stock_events INTEGER DEFAULT 0, + + -- Price stats + min_price NUMERIC(10,2), + max_price NUMERIC(10,2), + avg_price NUMERIC(10,2), + + -- Categories + categories TEXT[], + + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW(), + + UNIQUE(dispensary_id, brand_name, summary_date) +); + +CREATE INDEX IF NOT EXISTS idx_brand_store_day_dispensary ON brand_store_day_summary(dispensary_id, summary_date DESC); +CREATE INDEX IF NOT EXISTS idx_brand_store_day_brand ON brand_store_day_summary(brand_name, summary_date DESC); + +-- Product SKU daily summary +CREATE TABLE 
IF NOT EXISTS product_sku_day_summary ( + id SERIAL PRIMARY KEY, + dispensary_id INTEGER NOT NULL REFERENCES dispensaries(id), + product_slug VARCHAR(500) NOT NULL, + summary_date DATE NOT NULL, + + -- Latest values + name VARCHAR(500), + brand VARCHAR(255), + category VARCHAR(100), + + -- Price tracking + opening_price NUMERIC(10,2), -- first price of day + closing_price NUMERIC(10,2), -- last price of day + min_price NUMERIC(10,2), + max_price NUMERIC(10,2), + price_changes INTEGER DEFAULT 0, + + -- Availability + times_in_stock INTEGER DEFAULT 0, + times_out_of_stock INTEGER DEFAULT 0, + first_seen_at TIMESTAMPTZ, + last_seen_at TIMESTAMPTZ, + + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW(), + + UNIQUE(dispensary_id, product_slug, summary_date) +); + +CREATE INDEX IF NOT EXISTS idx_sku_day_dispensary ON product_sku_day_summary(dispensary_id, summary_date DESC); +CREATE INDEX IF NOT EXISTS idx_sku_day_slug ON product_sku_day_summary(product_slug, summary_date DESC); + +-- ===================================================== +-- VIEWS for common queries +-- ===================================================== + +-- Current products view (latest snapshot per product) +CREATE OR REPLACE VIEW current_products AS +SELECT DISTINCT ON (ps.dispensary_id, ps.product_slug) + ps.*, + d.name AS dispensary_name, + COALESCE(d.dba_name, d.name) AS store_name +FROM product_snapshots ps +JOIN dispensaries d ON d.id = ps.dispensary_id +ORDER BY ps.dispensary_id, ps.product_slug, ps.captured_at DESC; + +-- Current brands per store view +CREATE OR REPLACE VIEW current_brands AS +SELECT DISTINCT ON (bs.dispensary_id, bs.brand_name) + bs.*, + d.name AS dispensary_name, + COALESCE(d.dba_name, d.name) AS store_name +FROM brand_snapshots bs +JOIN dispensaries d ON d.id = bs.dispensary_id +WHERE bs.captured_at >= NOW() - INTERVAL '7 days' +ORDER BY bs.dispensary_id, bs.brand_name, bs.captured_at DESC; + +-- Brand coverage across stores +CREATE OR REPLACE VIEW brand_store_coverage AS +SELECT + brand_name, + COUNT(DISTINCT dispensary_id) AS store_count, + SUM(product_count) AS total_skus, + MIN(min_price) AS market_min_price, + MAX(max_price) AS market_max_price, + AVG(avg_price) AS market_avg_price, + MAX(captured_at) AS last_seen_at +FROM brand_snapshots +WHERE captured_at >= NOW() - INTERVAL '7 days' +GROUP BY brand_name; + +-- Grant permissions +GRANT SELECT, INSERT ON product_snapshots TO scraper; +GRANT SELECT, INSERT ON brand_snapshots TO scraper; +GRANT SELECT, INSERT, UPDATE ON crawl_runs TO scraper; +GRANT SELECT, INSERT, UPDATE ON brand_store_day_summary TO scraper; +GRANT SELECT, INSERT, UPDATE ON product_sku_day_summary TO scraper; +GRANT SELECT ON current_products TO scraper; +GRANT SELECT ON current_brands TO scraper; +GRANT SELECT ON brand_store_coverage TO scraper; diff --git a/backend/migrations/029_link_dutchie_stores_to_dispensaries.sql b/backend/migrations/029_link_dutchie_stores_to_dispensaries.sql deleted file mode 100644 index 27426846..00000000 --- a/backend/migrations/029_link_dutchie_stores_to_dispensaries.sql +++ /dev/null @@ -1,56 +0,0 @@ --- ===================================================== --- Link Dutchie Stores to Dispensaries --- ===================================================== --- Creates dispensary records for stores with dutchie_url that --- don't yet have a dispensary_id, then links them. 
- --- Create dispensaries for unlinked stores with dutchie_url -DO $$ -DECLARE - store_rec RECORD; - new_slug TEXT; - new_disp_id INTEGER; -BEGIN - FOR store_rec IN - SELECT id, name, dutchie_url - FROM stores - WHERE dutchie_url IS NOT NULL AND dispensary_id IS NULL - LOOP - -- Extract slug from dutchie_url - new_slug := regexp_replace( - regexp_replace(store_rec.dutchie_url, '^https://dutchie\.com/(embedded-menu|dispensary)/', ''), - '/.*$', '' - ); - - -- Insert or update dispensary - INSERT INTO dispensaries (name, slug, address, city, state, provider_type, menu_url, created_at, updated_at) - VALUES ( - store_rec.name, - new_slug, - 'TBD', -- Address to be filled in later - 'TBD', -- City to be filled in later - 'AZ', -- Default state - 'dutchie', - store_rec.dutchie_url, - NOW(), - NOW() - ) - ON CONFLICT (slug) DO UPDATE SET - provider_type = 'dutchie', - menu_url = EXCLUDED.menu_url, - updated_at = NOW() - RETURNING id INTO new_disp_id; - - -- Link store to dispensary - UPDATE stores SET dispensary_id = new_disp_id WHERE id = store_rec.id; - - RAISE NOTICE 'Linked store % (%) to dispensary %', store_rec.id, store_rec.name, new_disp_id; - END LOOP; -END $$; - --- Report on linked stores -SELECT s.id as store_id, s.name as store_name, s.dispensary_id, d.slug as disp_slug -FROM stores s -JOIN dispensaries d ON d.id = s.dispensary_id -WHERE s.dutchie_url IS NOT NULL -ORDER BY s.id; diff --git a/backend/src/routes/scraper-monitor.ts b/backend/src/routes/scraper-monitor.ts index 5d4d7bb9..9a6f85a5 100644 --- a/backend/src/routes/scraper-monitor.ts +++ b/backend/src/routes/scraper-monitor.ts @@ -71,36 +71,46 @@ router.get('/active/:id', async (req, res) => { // Get scraper history (last 50 completed scrapes) router.get('/history', async (req, res) => { try { - const { limit = 50, store_id } = req.query; + const { limit = 50, dispensary_id } = req.query; let query = ` SELECT - s.id as store_id, - s.name as store_name, - c.id as category_id, - c.name as category_name, - c.last_scraped_at, + d.id as dispensary_id, + COALESCE(d.dba_name, d.name) as dispensary_name, + d.city, + d.state, + dcj.id as job_id, + dcj.job_type, + dcj.status, + dcj.products_found, + dcj.products_new, + dcj.products_updated, + dcj.in_stock_count, + dcj.out_of_stock_count, + dcj.duration_ms, + dcj.completed_at as last_scraped_at, + dcj.error_message, ( SELECT COUNT(*) FROM products p - WHERE p.store_id = s.id - AND p.category_id = c.id + WHERE p.dispensary_id = d.id + AND p.last_seen_at >= NOW() - INTERVAL '7 days' ) as product_count - FROM stores s - LEFT JOIN categories c ON c.store_id = s.id - WHERE c.last_scraped_at IS NOT NULL + FROM dispensary_crawl_jobs dcj + JOIN dispensaries d ON d.id = dcj.dispensary_id + WHERE dcj.completed_at IS NOT NULL `; const params: any[] = []; let paramCount = 1; - if (store_id) { - query += ` AND s.id = $${paramCount}`; - params.push(store_id); + if (dispensary_id) { + query += ` AND d.id = $${paramCount}`; + params.push(dispensary_id); paramCount++; } - query += ` ORDER BY c.last_scraped_at DESC LIMIT $${paramCount}`; + query += ` ORDER BY dcj.completed_at DESC LIMIT $${paramCount}`; params.push(limit); const result = await pool.query(query, params); @@ -169,7 +179,7 @@ export function completeScraper(id: string, error?: string): void { } } -// Brand scrape jobs endpoints +// Dispensary crawl jobs endpoints router.get('/jobs/stats', async (req, res) => { try { const { dispensary_id } = req.query; @@ -187,8 +197,8 @@ router.get('/jobs/stats', async (req, res) => { status, COUNT(*) as 
count, SUM(products_found) as total_products_found, - SUM(products_saved) as total_products_saved - FROM brand_scrape_jobs + SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved + FROM dispensary_crawl_jobs ${whereClause} GROUP BY status `, params); @@ -205,8 +215,8 @@ router.get('/jobs/stats', async (req, res) => { result.rows.forEach((row: { status: string; count: string; total_products_found?: string; total_products_saved?: string }) => { stats[row.status as keyof typeof stats] = parseInt(row.count); if (row.status === 'completed') { - stats.total_products_found = parseInt(row.total_products_found || '0'); - stats.total_products_saved = parseInt(row.total_products_saved || '0'); + stats.total_products_found += parseInt(row.total_products_found || '0'); + stats.total_products_saved += parseInt(row.total_products_saved || '0'); } }); @@ -221,31 +231,32 @@ router.get('/jobs/active', async (req, res) => { try { const { dispensary_id } = req.query; - let whereClause = "WHERE status = 'in_progress'"; + let whereClause = "WHERE dcj.status = 'in_progress'"; const params: any[] = []; let paramCount = 1; if (dispensary_id) { - whereClause += ` AND dispensary_id = $${paramCount}`; + whereClause += ` AND dcj.dispensary_id = $${paramCount}`; params.push(dispensary_id); paramCount++; } const result = await pool.query(` SELECT - id, - dispensary_id, - brand_slug, - brand_name, - status, - worker_id, - started_at, - products_found, - products_saved, - EXTRACT(EPOCH FROM (NOW() - started_at)) as duration_seconds - FROM brand_scrape_jobs + dcj.id, + dcj.dispensary_id, + COALESCE(d.dba_name, d.name) as dispensary_name, + dcj.job_type, + dcj.status, + dcj.worker_id, + dcj.started_at, + dcj.products_found, + COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved, + EXTRACT(EPOCH FROM (NOW() - dcj.started_at)) as duration_seconds + FROM dispensary_crawl_jobs dcj + JOIN dispensaries d ON d.id = dcj.dispensary_id ${whereClause} - ORDER BY started_at DESC + ORDER BY dcj.started_at DESC `, params); res.json({ jobs: result.rows }); @@ -266,13 +277,13 @@ router.get('/jobs/recent', async (req, res) => { const conditions: string[] = []; if (dispensary_id) { - conditions.push(`dispensary_id = $${paramCount}`); + conditions.push(`dcj.dispensary_id = $${paramCount}`); params.push(dispensary_id); paramCount++; } if (status) { - conditions.push(`status = $${paramCount}`); + conditions.push(`dcj.status = $${paramCount}`); params.push(status); paramCount++; } @@ -285,22 +296,22 @@ router.get('/jobs/recent', async (req, res) => { const result = await pool.query(` SELECT - id, - dispensary_id, - brand_slug, - brand_name, - status, - worker_id, - started_at, - completed_at, - products_found, - products_saved, - error_message, - retry_count, - EXTRACT(EPOCH FROM (COALESCE(completed_at, NOW()) - started_at)) as duration_seconds - FROM brand_scrape_jobs + dcj.id, + dcj.dispensary_id, + COALESCE(d.dba_name, d.name) as dispensary_name, + dcj.job_type, + dcj.status, + dcj.worker_id, + dcj.started_at, + dcj.completed_at, + dcj.products_found, + COALESCE(dcj.products_new, 0) + COALESCE(dcj.products_updated, 0) as products_saved, + dcj.error_message, + EXTRACT(EPOCH FROM (COALESCE(dcj.completed_at, NOW()) - dcj.started_at)) as duration_seconds + FROM dispensary_crawl_jobs dcj + JOIN dispensaries d ON d.id = dcj.dispensary_id ${whereClause} - ORDER BY created_at DESC + ORDER BY dcj.created_at DESC LIMIT $${paramCount} `, params); @@ -328,10 +339,10 @@ 
router.get('/jobs/workers', async (req, res) => { worker_id, COUNT(*) as active_jobs, SUM(products_found) as total_products_found, - SUM(products_saved) as total_products_saved, + SUM(COALESCE(products_new, 0) + COALESCE(products_updated, 0)) as total_products_saved, MIN(started_at) as earliest_start, MAX(started_at) as latest_start - FROM brand_scrape_jobs + FROM dispensary_crawl_jobs ${whereClause} GROUP BY worker_id ORDER BY worker_id diff --git a/backend/src/scraper-v2/downloader.ts b/backend/src/scraper-v2/downloader.ts index 485ae3c6..4614b408 100644 --- a/backend/src/scraper-v2/downloader.ts +++ b/backend/src/scraper-v2/downloader.ts @@ -3,16 +3,108 @@ import axios from 'axios'; import { ScraperRequest, ScraperResponse, ScraperError, ErrorType, ProxyConfig } from './types'; import { logger } from '../services/logger'; +// Fingerprint profiles for randomization +const SCREEN_RESOLUTIONS = [ + { width: 1920, height: 1080 }, + { width: 1366, height: 768 }, + { width: 1536, height: 864 }, + { width: 1440, height: 900 }, + { width: 1280, height: 720 }, + { width: 2560, height: 1440 }, + { width: 1680, height: 1050 }, + { width: 1600, height: 900 }, +]; + +const TIMEZONES = [ + 'America/New_York', + 'America/Chicago', + 'America/Denver', + 'America/Los_Angeles', + 'America/Phoenix', +]; + +const LANGUAGES = [ + ['en-US', 'en'], + ['en-US', 'en', 'es'], + ['en-US'], +]; + +const PLATFORMS = [ + 'Win32', + 'MacIntel', + 'Linux x86_64', +]; + +const WEBGL_VENDORS = [ + 'Google Inc. (NVIDIA)', + 'Google Inc. (Intel)', + 'Google Inc. (AMD)', + 'Intel Inc.', + 'NVIDIA Corporation', +]; + +const WEBGL_RENDERERS = [ + 'ANGLE (NVIDIA GeForce GTX 1080 Direct3D11 vs_5_0 ps_5_0)', + 'ANGLE (Intel(R) UHD Graphics 630 Direct3D11 vs_5_0 ps_5_0)', + 'ANGLE (AMD Radeon RX 580 Series Direct3D11 vs_5_0 ps_5_0)', + 'Intel Iris OpenGL Engine', + 'NVIDIA GeForce RTX 3070/PCIe/SSE2', + 'AMD Radeon Pro 5500M OpenGL Engine', +]; + +interface Fingerprint { + screen: { width: number; height: number }; + timezone: string; + languages: string[]; + platform: string; + hardwareConcurrency: number; + deviceMemory: number; + webglVendor: string; + webglRenderer: string; +} + +function generateRandomFingerprint(): Fingerprint { + return { + screen: SCREEN_RESOLUTIONS[Math.floor(Math.random() * SCREEN_RESOLUTIONS.length)], + timezone: TIMEZONES[Math.floor(Math.random() * TIMEZONES.length)], + languages: LANGUAGES[Math.floor(Math.random() * LANGUAGES.length)], + platform: PLATFORMS[Math.floor(Math.random() * PLATFORMS.length)], + hardwareConcurrency: [4, 8, 12, 16][Math.floor(Math.random() * 4)], + deviceMemory: [4, 8, 16, 32][Math.floor(Math.random() * 4)], + webglVendor: WEBGL_VENDORS[Math.floor(Math.random() * WEBGL_VENDORS.length)], + webglRenderer: WEBGL_RENDERERS[Math.floor(Math.random() * WEBGL_RENDERERS.length)], + }; +} + export class Downloader { private browser: Browser | null = null; private page: Page | null = null; private pageInUse: boolean = false; + private currentFingerprint: Fingerprint = generateRandomFingerprint(); + private needsNewFingerprint: boolean = false; /** - * Initialize browser instance (lazy initialization) + * Force new fingerprint on next browser creation */ - private async getBrowser(): Promise { + rotateFingerprint(): void { + this.needsNewFingerprint = true; + logger.info('scraper', '🔄 Fingerprint rotation scheduled'); + } + + /** + * Initialize browser instance with fingerprint + */ + private async getBrowser(forceNew: boolean = false): Promise { + // Create new browser if needed for 
fingerprint rotation + if (forceNew || this.needsNewFingerprint) { + await this.close(); + this.currentFingerprint = generateRandomFingerprint(); + this.needsNewFingerprint = false; + logger.info('scraper', `🎭 New fingerprint: ${this.currentFingerprint.screen.width}x${this.currentFingerprint.screen.height}, ${this.currentFingerprint.timezone}, ${this.currentFingerprint.platform}`); + } + if (!this.browser || !this.browser.isConnected()) { + const { screen } = this.currentFingerprint; const launchOptions: any = { headless: 'new', args: [ @@ -20,9 +112,11 @@ export class Downloader { '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-blink-features=AutomationControlled', - '--window-size=1920,1080', + `--window-size=${screen.width},${screen.height}`, '--disable-web-security', - '--disable-features=IsolateOrigins,site-per-process' + '--disable-features=IsolateOrigins,site-per-process', + '--disable-infobars', + '--disable-extensions', ] }; @@ -34,52 +128,157 @@ export class Downloader { } /** - * Get or create a page instance + * Get or create a page instance with current fingerprint */ - private async getPage(): Promise { - if (!this.page || this.page.isClosed()) { - const browser = await this.getBrowser(); + private async getPage(forceNew: boolean = false): Promise { + if (!this.page || this.page.isClosed() || forceNew) { + const browser = await this.getBrowser(forceNew); this.page = await browser.newPage(); - await this.page.setViewport({ width: 1920, height: 1080 }); - logger.debug('scraper', 'New page created'); + + const { screen } = this.currentFingerprint; + await this.page.setViewport({ + width: screen.width, + height: screen.height, + deviceScaleFactor: 1, + }); + + // Apply fingerprint + await this.applyFingerprint(this.page); + logger.debug('scraper', 'New page created with fingerprint'); } return this.page; } /** - * Apply stealth mode to page + * Apply full fingerprint to page */ - private async makePageStealthy(page: Page): Promise { - await page.evaluateOnNewDocument(() => { - // @ts-ignore - runs in browser context + private async applyFingerprint(page: Page): Promise { + const fp = this.currentFingerprint; + + await page.evaluateOnNewDocument((fingerprint) => { + // Hide webdriver Object.defineProperty(navigator, 'webdriver', { get: () => false, }); - // @ts-ignore - runs in browser context - Object.defineProperty(navigator, 'plugins', { - get: () => [1, 2, 3, 4, 5], + // Spoof platform + Object.defineProperty(navigator, 'platform', { + get: () => fingerprint.platform, }); - // @ts-ignore - runs in browser context + // Spoof languages Object.defineProperty(navigator, 'languages', { - get: () => ['en-US', 'en'], + get: () => fingerprint.languages, }); - // @ts-ignore - runs in browser context + // Spoof hardware concurrency + Object.defineProperty(navigator, 'hardwareConcurrency', { + get: () => fingerprint.hardwareConcurrency, + }); + + // Spoof device memory + Object.defineProperty(navigator, 'deviceMemory', { + get: () => fingerprint.deviceMemory, + }); + + // Spoof plugins (realistic count) + Object.defineProperty(navigator, 'plugins', { + get: () => { + const plugins: any = []; + for (let i = 0; i < 5; i++) { + plugins.push({ + name: `Plugin ${i}`, + filename: `plugin${i}.dll`, + description: `Description ${i}`, + }); + } + plugins.length = 5; + return plugins; + }, + }); + + // Chrome object (window as any).chrome = { runtime: {}, + loadTimes: () => ({}), + csi: () => ({}), + app: {}, }; - // @ts-ignore - runs in browser context + // Permissions const 
originalQuery = window.navigator.permissions.query; - // @ts-ignore - runs in browser context window.navigator.permissions.query = (parameters: any) => parameters.name === 'notifications' ? Promise.resolve({ state: 'denied' } as any) : originalQuery(parameters); - }); + + // WebGL fingerprint spoofing + const getParameterProxyHandler = { + apply: function(target: any, thisArg: any, argumentsList: any) { + const param = argumentsList[0]; + // UNMASKED_VENDOR_WEBGL + if (param === 37445) { + return fingerprint.webglVendor; + } + // UNMASKED_RENDERER_WEBGL + if (param === 37446) { + return fingerprint.webglRenderer; + } + return Reflect.apply(target, thisArg, argumentsList); + } + }; + + // Override WebGL + const originalGetContext = HTMLCanvasElement.prototype.getContext; + (HTMLCanvasElement.prototype as any).getContext = function(this: HTMLCanvasElement, type: string, ...args: any[]) { + const context = originalGetContext.call(this, type, ...args); + if (context && (type === 'webgl' || type === 'webgl2' || type === 'experimental-webgl')) { + const glContext = context as WebGLRenderingContext; + const originalGetParameter = glContext.getParameter.bind(glContext); + (glContext as any).getParameter = new Proxy(originalGetParameter, getParameterProxyHandler); + } + return context; + }; + + // Canvas fingerprint noise + const originalToDataURL = HTMLCanvasElement.prototype.toDataURL; + HTMLCanvasElement.prototype.toDataURL = function(type?: string) { + const context = this.getContext('2d'); + if (context) { + const imageData = context.getImageData(0, 0, this.width, this.height); + for (let i = 0; i < imageData.data.length; i += 4) { + // Add tiny noise to RGB values + imageData.data[i] = imageData.data[i] ^ (Math.random() > 0.5 ? 1 : 0); + } + context.putImageData(imageData, 0, 0); + } + return originalToDataURL.call(this, type); + }; + + // Screen dimensions + Object.defineProperty(window.screen, 'width', { get: () => fingerprint.screen.width }); + Object.defineProperty(window.screen, 'height', { get: () => fingerprint.screen.height }); + Object.defineProperty(window.screen, 'availWidth', { get: () => fingerprint.screen.width }); + Object.defineProperty(window.screen, 'availHeight', { get: () => fingerprint.screen.height - 40 }); + Object.defineProperty(window, 'innerWidth', { get: () => fingerprint.screen.width }); + Object.defineProperty(window, 'innerHeight', { get: () => fingerprint.screen.height - 140 }); + Object.defineProperty(window, 'outerWidth', { get: () => fingerprint.screen.width }); + Object.defineProperty(window, 'outerHeight', { get: () => fingerprint.screen.height }); + + }, fp); + + // Set timezone via CDP + const client = await page.target().createCDPSession(); + await client.send('Emulation.setTimezoneOverride', { timezoneId: fp.timezone }); + } + + /** + * Apply stealth mode to page (legacy - now uses applyFingerprint) + */ + private async makePageStealthy(page: Page): Promise { + // Now handled by applyFingerprint + await this.applyFingerprint(page); } /** diff --git a/backend/src/scraper-v2/middlewares.ts b/backend/src/scraper-v2/middlewares.ts index c5129a49..d3de1a95 100644 --- a/backend/src/scraper-v2/middlewares.ts +++ b/backend/src/scraper-v2/middlewares.ts @@ -1,13 +1,32 @@ import { Middleware, ScraperRequest, ScraperResponse, ScraperError, ErrorType, ProxyConfig } from './types'; import { logger } from '../services/logger'; import { pool } from '../db/migrate'; +import { getActiveProxy, putProxyInTimeout, isBotDetectionError } from '../services/proxy'; +// 
Diverse, realistic user agents - updated for 2024/2025 const USER_AGENTS = [ + // Chrome on Windows (most common) 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36', + // Chrome on Mac 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', + // Chrome on Linux 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36', + // Firefox 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0', - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15' + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.0; rv:121.0) Gecko/20100101 Firefox/121.0', + // Safari + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15', + // Edge + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0', ]; function getRandomUserAgent(): string { @@ -19,59 +38,90 @@ function sleep(ms: number): Promise { } /** - * User Agent Rotation Middleware + * User Agent Rotation Middleware - rotates UA on each request for better evasion */ export class UserAgentMiddleware implements Middleware { name = 'UserAgentMiddleware'; priority = 100; + private lastUserAgent: string | null = null; + async processRequest(request: ScraperRequest): Promise { - if (!request.metadata.userAgent) { - request.metadata.userAgent = getRandomUserAgent(); + // Always rotate UA on retries or bot detection + const forceRotation = request.retryCount > 0 || request.metadata.botDetected; + + if (!request.metadata.userAgent || forceRotation) { + // Get a different UA than the last one used + let newUA = getRandomUserAgent(); + let attempts = 0; + while (newUA === this.lastUserAgent && attempts < 5) { + newUA = getRandomUserAgent(); + attempts++; + } + request.metadata.userAgent = newUA; + this.lastUserAgent = newUA; + + if (forceRotation) { + logger.debug('scraper', `🔄 Rotated User-Agent: ${newUA.substring(0, 50)}...`); + } } return request; } } /** - * Proxy Rotation Middleware + * Proxy Rotation Middleware - uses the central proxy service with timeout handling */ export class ProxyMiddleware implements Middleware { name = 'ProxyMiddleware'; priority = 90; - private async getActiveProxy(): Promise { - try { - const result = await pool.query(` - SELECT 
host, port, protocol, username, password - FROM proxies - WHERE active = true AND is_anonymous = true - ORDER BY RANDOM() - LIMIT 1 - `); - - if (result.rows.length === 0) { - return null; - } - - return result.rows[0]; - } catch (error) { - logger.error('scraper', `Failed to get proxy: ${error}`); - return null; - } - } + private currentProxyId: number | null = null; async processRequest(request: ScraperRequest): Promise { - // Only add proxy if not already set - if (!request.metadata.proxy && request.retryCount > 0) { - // Use proxy on retries - request.metadata.proxy = await this.getActiveProxy(); - if (request.metadata.proxy) { - logger.debug('scraper', `Using proxy for retry: ${request.metadata.proxy.host}:${request.metadata.proxy.port}`); + // Always try to use a proxy from the central proxy service + // The service handles bot detection timeouts automatically + const forceRotation = request.retryCount > 0 || request.metadata.botDetected; + + if (!request.metadata.proxy || forceRotation) { + // Get proxy from central service - it handles timeouts automatically + const proxy = await getActiveProxy(); + if (proxy) { + request.metadata.proxy = { + host: proxy.host, + port: proxy.port, + protocol: proxy.protocol, + username: proxy.username, + password: proxy.password, + }; + request.metadata.proxyId = proxy.id; + this.currentProxyId = proxy.id; + const reason = forceRotation ? 'rotation' : 'initial'; + logger.info('scraper', `🔄 Using proxy (${reason}): ${proxy.protocol}://${proxy.host}:${proxy.port}`); + } else { + logger.warn('scraper', '⚠️ No proxy available - running without proxy'); } } return request; } + + async processResponse(response: ScraperResponse): Promise { + // If bot detection was triggered, put the proxy in timeout + if (response.request.metadata.botDetected && response.request.metadata.proxyId) { + putProxyInTimeout(response.request.metadata.proxyId, 'Bot detection triggered'); + logger.info('scraper', `🚫 Proxy ${response.request.metadata.proxyId} put in timeout due to bot detection`); + } + return response; + } + + async processError(error: Error, request: ScraperRequest): Promise { + // If bot detection error, put proxy in timeout + if (isBotDetectionError(error.message) && request.metadata.proxyId) { + putProxyInTimeout(request.metadata.proxyId, error.message); + logger.info('scraper', `🚫 Proxy ${request.metadata.proxyId} put in timeout: ${error.message}`); + } + return error; + } } /** @@ -194,7 +244,7 @@ export class RetryMiddleware implements Middleware { } /** - * Bot Detection Middleware + * Bot Detection Middleware - detects bot blocking and triggers fingerprint rotation */ export class BotDetectionMiddleware implements Middleware { name = 'BotDetectionMiddleware'; @@ -203,6 +253,9 @@ export class BotDetectionMiddleware implements Middleware { private detectedCount: number = 0; private readonly DETECTION_THRESHOLD = 3; + // Export for use by other middlewares + static shouldRotateFingerprint: boolean = false; + async processResponse(response: ScraperResponse): Promise { const content = typeof response.content === 'string' ? 
response.content @@ -215,17 +268,29 @@ export class BotDetectionMiddleware implements Middleware { /access denied/i, /you have been blocked/i, /unusual traffic/i, - /robot/i + /robot/i, + /verify.*human/i, + /security check/i, + /please wait/i, + /checking your browser/i, + /ray id/i ]; const detected = botIndicators.some(pattern => pattern.test(content)); if (detected) { this.detectedCount++; + BotDetectionMiddleware.shouldRotateFingerprint = true; + logger.warn('scraper', `Bot detection suspected (${this.detectedCount}/${this.DETECTION_THRESHOLD}): ${response.url}`); + logger.info('scraper', '🔄 Flagging for proxy/UA rotation on next request'); + + // Mark the request for rotation on retry + response.request.metadata.botDetected = true; + response.request.metadata.needsNewBrowser = true; if (this.detectedCount >= this.DETECTION_THRESHOLD) { - const error: ScraperError = new Error('Bot detection threshold reached') as ScraperError; + const error: ScraperError = new Error('Bot detection threshold reached - rotating fingerprint') as ScraperError; error.type = ErrorType.BOT_DETECTION; error.retryable = true; error.request = response.request; @@ -234,10 +299,25 @@ export class BotDetectionMiddleware implements Middleware { } else { // Gradually decrease detection count on successful requests this.detectedCount = Math.max(0, this.detectedCount - 0.5); + BotDetectionMiddleware.shouldRotateFingerprint = false; } return response; } + + async processError(error: Error, request: ScraperRequest): Promise { + // If bot detection error, flag for rotation and allow retry + if ('type' in error && (error as ScraperError).type === ErrorType.BOT_DETECTION) { + request.metadata.botDetected = true; + request.metadata.needsNewBrowser = true; + logger.info('scraper', '🔄 Bot detection error - will rotate proxy/UA on retry'); + + // Add delay before retry to avoid rate limiting + await sleep(5000 + Math.random() * 5000); + return null; // Return null to trigger retry + } + return error; + } } /** diff --git a/backend/src/scraper-v2/pipelines.ts b/backend/src/scraper-v2/pipelines.ts index 9d15cec8..5f4ed3b4 100644 --- a/backend/src/scraper-v2/pipelines.ts +++ b/backend/src/scraper-v2/pipelines.ts @@ -154,6 +154,17 @@ export class ImagePipeline implements ItemPipeline { } } +/** + * Generate a URL-safe slug from a product name + */ +function generateSlug(name: string): string { + return name + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, '') + .substring(0, 400); +} + /** * Database Pipeline - saves items to database */ @@ -168,6 +179,10 @@ export class DatabasePipeline implements ItemPipeline { // Extract store and category from metadata (set by spider) const storeId = (item as any).storeId; const categoryId = (item as any).categoryId; + const dispensaryId = (item as any).dispensaryId; + + // Generate slug from name + const slug = generateSlug(item.name); if (!storeId || !categoryId) { logger.error('pipeline', `Missing storeId or categoryId for ${item.name}`); @@ -195,13 +210,13 @@ export class DatabasePipeline implements ItemPipeline { strain_type = $4, thc_percentage = $5, cbd_percentage = $6, brand = $7, weight = $8, image_url = $9, dutchie_url = $10, in_stock = true, metadata = $11, last_seen_at = CURRENT_TIMESTAMP, - updated_at = CURRENT_TIMESTAMP + updated_at = CURRENT_TIMESTAMP, dispensary_id = $13, slug = COALESCE(slug, $14) WHERE id = $12 `, [ item.name, item.description, item.price, item.strainType, item.thcPercentage, item.cbdPercentage, item.brand, item.weight, item.imageUrl, 
item.dutchieUrl, - JSON.stringify(item.metadata || {}), productId + JSON.stringify(item.metadata || {}), productId, dispensaryId, slug ]); logger.debug('pipeline', `Updated product: ${item.name}`); @@ -209,13 +224,13 @@ export class DatabasePipeline implements ItemPipeline { // Insert new product const insertResult = await client.query(` INSERT INTO products ( - store_id, category_id, dutchie_product_id, name, description, + store_id, category_id, dispensary_id, dutchie_product_id, slug, name, description, price, strain_type, thc_percentage, cbd_percentage, brand, weight, image_url, dutchie_url, in_stock, metadata - ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, true, $14) + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, true, $16) RETURNING id `, [ - storeId, categoryId, item.dutchieProductId, item.name, item.description, + storeId, categoryId, dispensaryId, item.dutchieProductId, slug, item.name, item.description, item.price, item.strainType, item.thcPercentage, item.cbdPercentage, item.brand, item.weight, item.imageUrl, item.dutchieUrl, JSON.stringify(item.metadata || {}) @@ -228,12 +243,19 @@ export class DatabasePipeline implements ItemPipeline { // Download image if needed if (item.imageUrl && !localImagePath) { try { - localImagePath = await uploadImageFromUrl(item.imageUrl, productId); + // Get store slug for organized image storage + const storeResult = await client.query( + 'SELECT slug FROM stores WHERE id = $1', + [storeId] + ); + const storeSlug = storeResult.rows[0]?.slug || undefined; + + const imageSizes = await uploadImageFromUrl(item.imageUrl, productId, storeSlug); + // Use thumbnail path for local_image_path + localImagePath = imageSizes.thumbnail; await client.query(` - UPDATE products - SET local_image_path = $1 - WHERE id = $2 - `, [localImagePath, productId]); + UPDATE products SET local_image_path = $1 WHERE id = $2 + `, [imageSizes.thumbnail, productId]); logger.debug('pipeline', `Downloaded image for: ${item.name}`); } catch (error) { logger.error('pipeline', `Failed to download image for ${item.name}: ${error}`); diff --git a/backend/src/scripts/queue-dispensaries.ts b/backend/src/scripts/queue-dispensaries.ts index 354ed6b9..81e8104a 100644 --- a/backend/src/scripts/queue-dispensaries.ts +++ b/backend/src/scripts/queue-dispensaries.ts @@ -176,7 +176,7 @@ async function queueProductionCrawls(): Promise { SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50, jsonb_build_object('dispensary_id', $1, 'source', 'queue-dispensaries') FROM stores s - JOIN dispensaries d ON (d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%') + JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%') WHERE d.id = $1 LIMIT 1`, [dispensary.id] diff --git a/backend/src/scripts/queue-intelligence.ts b/backend/src/scripts/queue-intelligence.ts index 8f461996..04ede9e9 100644 --- a/backend/src/scripts/queue-intelligence.ts +++ b/backend/src/scripts/queue-intelligence.ts @@ -221,7 +221,7 @@ async function queueCategoryProductionCrawls(category?: IntelligenceCategory): P SELECT s.id, 'full_crawl', 'scheduled', 'pending', 50, jsonb_build_object('dispensary_id', $1, 'category', $2, 'source', 'queue-intelligence') FROM stores s - JOIN dispensaries d ON (d.menu_url = s.dutchie_plus_url OR d.name ILIKE '%' || s.name || '%') + JOIN dispensaries d ON (d.menu_url = s.dutchie_url OR d.name ILIKE '%' || s.name || '%') WHERE d.id = $1 LIMIT 1`, [dispensary.id, cat] diff --git 
a/backend/src/services/crawler-jobs.ts b/backend/src/services/crawler-jobs.ts index 7b4efbf5..383c724f 100644 --- a/backend/src/services/crawler-jobs.ts +++ b/backend/src/services/crawler-jobs.ts @@ -131,7 +131,7 @@ async function getStoreIdForDispensary(dispensaryId: number): Promise { +export async function uploadImageFromUrl( + imageUrl: string, + productId: number, + storeSlug?: string, + removeBackgrounds = true +): Promise { try { // Download image const response = await axios.get(imageUrl, { responseType: 'arraybuffer' }); @@ -184,8 +194,9 @@ export async function uploadImageFromUrl(imageUrl: string, productId: number, re buffer = await removeBackground(buffer); } - // Generate unique base filename - const baseFilename = `products/${productId}-${uuidv4()}`; + // Generate unique base filename - organize by store if slug provided + const storeDir = storeSlug ? `products/${storeSlug}` : 'products'; + const baseFilename = `${storeDir}/${productId}-${uuidv4()}`; // Create multiple sizes with Sharp and convert to WebP/PNG for better compression // Use PNG for images with transparency diff --git a/frontend/src/pages/ScraperMonitor.tsx b/frontend/src/pages/ScraperMonitor.tsx index 30fcea17..78081332 100644 --- a/frontend/src/pages/ScraperMonitor.tsx +++ b/frontend/src/pages/ScraperMonitor.tsx @@ -93,7 +93,7 @@ export function ScraperMonitor() { marginBottom: '-2px' }} > - Brand Scrape Jobs + Dispensary Jobs @@ -232,10 +232,10 @@ export function ScraperMonitor() {
-          {job.brand_name}
+          {job.dispensary_name || job.brand_name}
-          Worker: {job.worker_id} | Job #{job.id}
+          {job.job_type || 'crawl'} | Job #{job.id}
@@ -290,8 +290,8 @@ export function ScraperMonitor() { - - + + @@ -302,8 +302,8 @@ export function ScraperMonitor() { {recentJobs.map((job: any) => ( - - + + - - + + + - + {history.map((item, index) => ( - - + + + ))} diff --git a/frontend/src/pages/ScraperTools.tsx b/frontend/src/pages/ScraperTools.tsx index 6d9df50c..dbe3ac0e 100644 --- a/frontend/src/pages/ScraperTools.tsx +++ b/frontend/src/pages/ScraperTools.tsx @@ -17,61 +17,61 @@ const USER_AGENTS = { }; export function ScraperTools() { - const [stores, setStores] = useState([]); - const [selectedStore, setSelectedStore] = useState(null); + const [dispensaries, setDispensaries] = useState([]); + const [selectedDispensary, setSelectedDispensary] = useState(null); const [parallelScrapers, setParallelScrapers] = useState(3); const [selectedUserAgent, setSelectedUserAgent] = useState('rotate-desktop'); const [scraping, setScraping] = useState(false); const [downloadingImages, setDownloadingImages] = useState(false); - const [discoveringCategories, setDiscoveringCategories] = useState(false); - const [debugging, setDebugging] = useState(false); const [notification, setNotification] = useState<{ message: string; type: 'success' | 'error' | 'info' } | null>(null); const [loading, setLoading] = useState(true); useEffect(() => { - loadStores(); + loadDispensaries(); }, []); - const loadStores = async () => { + const loadDispensaries = async () => { setLoading(true); try { - const data = await api.getStores(); - setStores(data.stores); - if (data.stores.length > 0) { - setSelectedStore(data.stores[0].id); + const data = await api.getDispensaries(); + // Filter to dispensaries that have a menu_url and are scrape enabled + const scrapableDispensaries = data.dispensaries.filter((d: any) => d.menu_url && d.scrape_enabled); + setDispensaries(scrapableDispensaries); + if (scrapableDispensaries.length > 0) { + setSelectedDispensary(scrapableDispensaries[0].id); } } catch (error) { - console.error('Failed to load stores:', error); + console.error('Failed to load dispensaries:', error); } finally { setLoading(false); } }; const handleScrape = async () => { - if (!selectedStore || scraping) return; + if (!selectedDispensary || scraping) return; setScraping(true); try { - await api.scrapeStore(selectedStore, parallelScrapers, selectedUserAgent || undefined); + await api.triggerDispensaryCrawl(selectedDispensary); setNotification({ - message: `Scrape started with ${parallelScrapers} parallel scrapers using ${USER_AGENTS[selectedUserAgent as keyof typeof USER_AGENTS] || 'Random'} UA! Check the Scraper Monitor for progress.`, + message: `Crawl started for dispensary! Check the Scraper Monitor for progress.`, type: 'success' }); } catch (error: any) { - setNotification({ message: 'Failed to start scrape: ' + error.message, type: 'error' }); + setNotification({ message: 'Failed to start crawl: ' + error.message, type: 'error' }); } finally { setScraping(false); } }; const handleDownloadImages = async () => { - if (!selectedStore || downloadingImages) return; + if (!selectedDispensary || downloadingImages) return; setDownloadingImages(true); try { - const result = await api.downloadStoreImages(selectedStore); + // TODO: Implement dispensary image download endpoint setNotification({ - message: `Image download started! 
${result.total_missing} missing images will be downloaded.`, + message: `Image download feature coming soon!`, type: 'info' }); } catch (error: any) { @@ -81,35 +81,7 @@ export function ScraperTools() { } }; - const handleDiscoverCategories = async () => { - if (!selectedStore || discoveringCategories) return; - - setDiscoveringCategories(true); - try { - await api.discoverStoreCategories(selectedStore); - setNotification({ message: 'Category discovery started! Check logs for progress.', type: 'info' }); - } catch (error: any) { - setNotification({ message: 'Failed to start category discovery: ' + error.message, type: 'error' }); - } finally { - setDiscoveringCategories(false); - } - }; - - const handleDebug = async () => { - if (!selectedStore || debugging) return; - - setDebugging(true); - try { - await api.debugScrapeStore(selectedStore); - setNotification({ message: 'Debug started! Check Logs page for output.', type: 'info' }); - } catch (error: any) { - setNotification({ message: 'Debug failed: ' + error.message, type: 'error' }); - } finally { - setDebugging(false); - } - }; - - const selectedStoreData = stores.find(s => s.id === selectedStore); + const selectedDispensaryData = dispensaries.find(d => d.id === selectedDispensary); if (loading) { return ( @@ -133,32 +105,32 @@ export function ScraperTools() {
       Scraper Tools
-      Manage scraping operations for your stores
+      Manage crawling operations for dispensaries

-      {/* Store Selection */}
+      {/* Dispensary Selection */}
-      Select Store
+      Select Dispensary

-      {selectedStoreData && (
+      {selectedDispensaryData && (
       Status
-      {selectedStoreData.scrape_enabled ? (
+      {selectedDispensaryData.scrape_enabled ? (
         Enabled
       ) : (
         Disabled
       )}

-      Categories
-      {selectedStoreData.category_count || 0}
+      Provider
+      {selectedDispensaryData.provider_type || 'Unknown'}

       Products
-      {selectedStoreData.product_count || 0}
+      {selectedDispensaryData.product_count || 0}

-      Last Scraped
-      {selectedStoreData.last_scraped_at
-        ? new Date(selectedStoreData.last_scraped_at).toLocaleDateString()
+      Last Crawled
+      {selectedDispensaryData.last_crawl_at
+        ? new Date(selectedDispensaryData.last_crawl_at).toLocaleDateString()
         : 'Never'}

       {/* Scraper Actions */}
-      {/* Scrape Now */}
+      {/* Crawl Now */}
-      Scrape Store
+      Crawl Dispensary
-      Start scraping products from the selected store
+      Start crawling products from the selected dispensary menu

-      <input value={parallelScrapers}
-        onChange={(e) => setParallelScrapers(parseInt(e.target.value) || 3)}
-        className="input input-bordered w-full" />
@@ -249,13 +186,13 @@ export function ScraperTools() {

       Download Images
-      Download missing product images for the selected store
+      Download missing product images for the selected dispensary

-      {/* Discover Categories */}
-      Discover Categories
-      Automatically discover and create categories from the store
-
-      {/* Debug Scraper */}
-      Debug Scraper
-      Run scraper in debug mode and view detailed logs
{/* Quick Links */}
(Table cells from the ScraperMonitor.tsx hunks above, recovered from extraction residue:)
-      Brand / Worker / Status / Found / Saved
+      Dispensary / Type / Status / Found / Saved
-      {job.brand_name} / {job.worker_id || '-'}
+      {job.dispensary_name || job.brand_name} / {job.job_type || '-'}
-      Store / Category / Found Products / Last Scraped
+      Dispensary / Status / Found / Products / Last Crawled
-      {item.store_name} / {item.category_name}
+      {item.dispensary_name || item.store_name} / {item.status || '-'} / {item.products_found || '-'}
       {item.product_count}
-      {new Date(item.last_scraped_at).toLocaleString()}
+      {item.last_scraped_at ? new Date(item.last_scraped_at).toLocaleString() : '-'}
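Usage sketch (illustrative, not part of the patch): once migration 028 is applied, downstream code can read the latest observed state per product through the current_products view. This assumes the shared pg pool exported by backend/src/db/migrate, which this patch already imports elsewhere; the dispensary id is a placeholder.

import { pool } from '../db/migrate';

// Latest snapshot per product for one dispensary, served by the
// current_products view (DISTINCT ON dispensary_id + product_slug,
// newest captured_at first).
export async function getCurrentProducts(dispensaryId: number) {
  const { rows } = await pool.query(
    `SELECT name, brand, price, availability_status, captured_at
       FROM current_products
      WHERE dispensary_id = $1
      ORDER BY brand NULLS LAST, name`,
    [dispensaryId]
  );
  return rows;
}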