/** * Dutchie Location Discovery Service * * Discovers store locations from Dutchie city pages. * Each city can contain multiple dispensary locations. * * This module: * 1. Fetches location listings for a given city * 2. Upserts locations into dutchie_discovery_locations * 3. Does NOT create any canonical dispensary records * * Locations remain in "discovered" status until manually verified. */ import { Pool } from 'pg'; import axios from 'axios'; import puppeteer from 'puppeteer-extra'; import type { Browser, Page, Protocol } from 'puppeteer'; import StealthPlugin from 'puppeteer-extra-plugin-stealth'; import { DiscoveryLocation, DiscoveryLocationRow, DutchieLocationResponse, LocationDiscoveryResult, DiscoveryStatus, mapLocationRowToLocation, } from './types'; import { DiscoveryCity } from './types'; import { executeGraphQL, fetchPage, extractNextData, GRAPHQL_HASHES, setProxy, } from '../platforms/dutchie/client'; import { getStateProxy, getRandomProxy } from '../utils/proxyManager'; puppeteer.use(StealthPlugin()); // ============================================================ // PROXY INITIALIZATION // ============================================================ // Call initDiscoveryProxy() before any discovery operations to // set up proxy if USE_PROXY=true environment variable is set. // This is opt-in and does NOT break existing behavior. 
// ============================================================

/** True once this module has installed a proxy on the Dutchie client. */
let proxyInitialized = false;

/**
 * Upper-cased state code that proxy selection was last attempted for, or null
 * when the proxy was requested without a state code (or nothing is installed).
 */
let proxyStateCode: string | null = null;

/**
 * Initialize the proxy used for discovery operations.
 *
 * A proxy is only installed when the USE_PROXY environment variable is exactly
 * 'true'. The function is safe to call repeatedly:
 * - a call without a state code is a no-op once any proxy is installed;
 * - a call with a state code is a no-op once selection has been attempted for
 *   that same state; otherwise it tries to switch to that state's proxy, so a
 *   generic proxy installed by an earlier call no longer blocks the
 *   state-specific one.
 *
 * If switching fails, the previously installed proxy is left in place.
 *
 * @param stateCode - Optional state code for a state-specific proxy (e.g., 'AZ', 'CA')
 * @returns true if a proxy is installed when the call returns, false if it was
 *          skipped or could not be set up
 */
export async function initDiscoveryProxy(stateCode?: string): Promise<boolean> {
  const requestedState = stateCode ? stateCode.toUpperCase() : null;

  // Nothing to do: a proxy is installed and the caller is not asking for a
  // different state than the one already handled.
  if (proxyInitialized && (requestedState === null || requestedState === proxyStateCode)) {
    return true;
  }

  if (process.env.USE_PROXY !== 'true') {
    if (proxyInitialized) {
      // A proxy installed earlier stays active; just don't replace it.
      return true;
    }
    console.log('[LocationDiscovery] Proxy disabled (USE_PROXY != true)');
    return false;
  }

  try {
    // Prefer a state-specific proxy when a state code was provided
    const proxyConfig = stateCode
      ? await getStateProxy(stateCode)
      : await getRandomProxy();

    if (!proxyConfig) {
      if (proxyInitialized) {
        // Remember the attempt so every later call for this state does not
        // repeat the lookup.
        proxyStateCode = requestedState;
        console.warn(`[LocationDiscovery] No proxy available for ${stateCode}, keeping existing proxy`);
        return true;
      }
      console.warn('[LocationDiscovery] No proxy available, proceeding without proxy');
      return false;
    }

    // Embed credentials in the proxy URL when both are present
    let proxyUrl = proxyConfig.server;
    if (proxyConfig.username && proxyConfig.password) {
      const url = new URL(proxyConfig.server);
      url.username = proxyConfig.username;
      url.password = proxyConfig.password;
      proxyUrl = url.toString();
    }

    // Install the proxy on the Dutchie client
    setProxy(proxyUrl);
    proxyInitialized = true;
    proxyStateCode = requestedState;

    console.log(`[LocationDiscovery] Proxy initialized for ${stateCode || 'general'} discovery`);
    return true;
  } catch (error: unknown) {
    const message = error instanceof Error ? error.message : String(error);
    console.error(`[LocationDiscovery] Failed to initialize proxy: ${message}`);
    if (proxyInitialized) {
      // The earlier proxy is still installed; avoid retrying this state on every call.
      proxyStateCode = requestedState;
    }
    return proxyInitialized;
  }
}

/**
 * Reset proxy initialization state (for testing or re-initialization).
 * Also clears the proxy on the Dutchie client by calling setProxy(null).
 */
export function resetProxyInit(): void {
  proxyInitialized = false;
  proxyStateCode = null;
  setProxy(null);
}

/** Platform identifier written to the `platform` column of discovery rows. */
const PLATFORM = 'dutchie';

// ============================================================
// CITY-BASED DISCOVERY (CANONICAL SOURCE OF TRUTH)
// ============================================================
// GraphQL with city+state filter is the SOURCE OF TRUTH for database
// data.
//
// Method:
// 1. Get city list from statesWithDispensaries (getAllCitiesByState GraphQL query)
// 2. Query stores per city using city + state GraphQL filter
// 3. This gives us complete, accurate dispensary data
//
// Geo-coordinate queries (nearLat/nearLng) are ONLY for showing search
// results to users (e.g., "stores within 20 miles of me").
// They are NOT a source of truth for establishing database records.
// ============================================================

/**
 * State with dispensary cities from Dutchie's statesWithDispensaries data
 */
export interface StateWithCities {
  /** State code (e.g., "CA", "AZ") */
  name: string;
  /** Country code (e.g., "US") */
  country: string;
  /** Array of city names */
  cities: string[];
}

/** Delay between per-city queries, in milliseconds, to avoid rate limiting. */
const CITY_QUERY_DELAY_MS = 300;

/** Best-effort message extraction for values caught in `catch` clauses. */
function toErrorMessage(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Fetch all states with their cities via direct GraphQL query.
 *
 * Uses the getAllCitiesByState persisted query and reads
 * `data.statesWithDispensaries` from the response. Entries without a name are
 * skipped; city entries that are not non-blank strings are dropped so callers
 * can safely treat every city as a string.
 *
 * @param options.verbose - When true, logs a per-state city count
 * @returns The parsed states, or an empty array if the query fails or the
 *          response does not contain an array of states
 */
export async function fetchStatesWithDispensaries(
  options: { verbose?: boolean } = {}
): Promise<StateWithCities[]> {
  const { verbose = false } = options;

  // Initialize proxy if USE_PROXY=true
  await initDiscoveryProxy();

  console.log('[LocationDiscovery] Fetching statesWithDispensaries via GraphQL...');

  try {
    const result = await executeGraphQL(
      'getAllCitiesByState',
      {}, // No variables needed
      GRAPHQL_HASHES.GetAllCitiesByState,
      { maxRetries: 3, retryOn403: true }
    );

    const statesData: unknown = result?.data?.statesWithDispensaries;
    if (!Array.isArray(statesData)) {
      console.error('[LocationDiscovery] statesWithDispensaries not found in response');
      return [];
    }

    const states: StateWithCities[] = [];
    for (const state of statesData) {
      if (!state || !state.name) {
        continue;
      }

      // Keep only usable city names (drops null/undefined/non-string/blank)
      const cities: string[] = Array.isArray(state.cities)
        ? state.cities.filter(
            (c: unknown): c is string => typeof c === 'string' && c.trim() !== ''
          )
        : [];

      states.push({
        name: state.name,
        country: state.country || 'US',
        cities,
      });
    }

    if (verbose) {
      console.log(`[LocationDiscovery] Found ${states.length} states`);
      for (const state of states) {
        console.log(`  ${state.name}: ${state.cities.length} cities`);
      }
    }

    console.log(`[LocationDiscovery] Loaded ${states.length} states with cities`);
    return states;
  } catch (error: unknown) {
    console.error(`[LocationDiscovery] Failed to fetch states: ${toErrorMessage(error)}`);
    return [];
  }
}

/**
 * Get cities for a specific state.
 *
 * @param stateCode - State code, matched case-insensitively against state names
 * @param options.verbose - Forwarded to fetchStatesWithDispensaries
 * @returns City names for the state, or an empty array when the state is not found
 */
export async function getCitiesForState(
  stateCode: string,
  options: { verbose?: boolean } = {}
): Promise<string[]> {
  const states = await fetchStatesWithDispensaries(options);
  const wanted = stateCode.toUpperCase();
  const state = states.find((s) => s.name.toUpperCase() === wanted);

  if (!state) {
    console.warn(`[LocationDiscovery] No cities found for state: ${stateCode}`);
    return [];
  }

  console.log(`[LocationDiscovery] Found ${state.cities.length} cities for ${stateCode}`);
  return state.cities;
}

/**
 * Fetch dispensaries for a specific city+state using GraphQL
 *
 * This is the CORRECT method for establishing database data:
 * Uses city + state filter, NOT geo-coordinates.
 *
 * Pages through the ConsumerDispensaries query until an empty or short page is
 * returned, a page request fails, or `maxPages` pages have been fetched.
 * Results are filtered to the requested state (case-insensitive), de-duplicated
 * by `id`/`_id`, and normalized. A failed page ends pagination and whatever was
 * collected so far is returned.
 *
 * @param city - City name as reported by Dutchie
 * @param stateCode - State code (e.g., 'AZ')
 * @param options.verbose - Log per-page counts
 * @param options.perPage - Page size (default 200)
 * @param options.maxPages - Maximum number of pages to request (default 10)
 * @returns Normalized, de-duplicated dispensaries for the city
 */
export async function fetchDispensariesByCityState(
  city: string,
  stateCode: string,
  options: { verbose?: boolean; perPage?: number; maxPages?: number } = {}
): Promise<DutchieLocationResponse[]> {
  const { verbose = false, perPage = 200, maxPages = 10 } = options;

  // Initialize proxy if USE_PROXY=true (state-specific proxy preferred)
  await initDiscoveryProxy(stateCode);

  console.log(`[LocationDiscovery] Fetching dispensaries for ${city}, ${stateCode}...`);

  // Loop-invariant values
  const wantedState = stateCode.toUpperCase();
  const cName = `${city.toLowerCase().replace(/\s+/g, '-')}-${stateCode.toLowerCase()}`;

  // First occurrence of each ID wins; entries without an ID are dropped
  const uniqueById = new Map<string, any>();
  let page = 0;
  let finished = false;

  while (!finished && page < maxPages) {
    const variables = {
      dispensaryFilter: {
        activeOnly: true,
        city: city,
        state: stateCode,
      },
      page,
      perPage,
    };

    try {
      const result = await executeGraphQL(
        'ConsumerDispensaries',
        variables,
        GRAPHQL_HASHES.ConsumerDispensaries,
        { cName, maxRetries: 2, retryOn403: true }
      );

      const payload = result?.data?.filteredDispensaries;
      const dispensaries: any[] = Array.isArray(payload) ? payload : [];

      if (verbose) {
        console.log(`[LocationDiscovery] Page ${page}: ${dispensaries.length} dispensaries`);
      }

      if (dispensaries.length === 0) {
        finished = true;
        continue;
      }

      for (const d of dispensaries) {
        // Only keep dispensaries in the requested state
        if (d?.location?.state?.toUpperCase() !== wantedState) {
          continue;
        }
        const id = d.id || d._id;
        if (id && !uniqueById.has(id)) {
          uniqueById.set(id, d);
        }
      }

      if (dispensaries.length < perPage) {
        finished = true; // short page: nothing more to fetch
      } else {
        page++;
      }
    } catch (error: unknown) {
      console.error(`[LocationDiscovery] Error fetching page ${page}: ${toErrorMessage(error)}`);
      finished = true;
    }
  }

  if (!finished) {
    // Loop ended because of the page cap, not because the data ran out
    console.warn(
      `[LocationDiscovery] Reached maxPages=${maxPages} for ${city}, ${stateCode}; results may be incomplete`
    );
  }

  const unique = Array.from(uniqueById.values());
  console.log(`[LocationDiscovery] Found ${unique.length} unique dispensaries in ${city}, ${stateCode}`);
  return unique.map((d) => normalizeLocationResponse(d));
}

/**
 * Fetch ALL dispensaries for a state by querying each city
 *
 * This is the canonical method for establishing state data:
 * 1. Get city list from statesWithDispensaries
 * 2. Query each city using city+state filter
 * 3. Dedupe and return all dispensaries
 *
 * Cities are queried sequentially with a short pause between requests. A
 * failure for one city is logged and does not stop the remaining cities.
 *
 * @param stateCode - State code (e.g., 'AZ')
 * @param options.verbose - Forwarded to the per-city fetch
 * @param options.progressCallback - Called before each city with the city name,
 *        its 1-based position, and the total number of cities
 * @returns De-duplicated dispensaries (keyed by `id`, falling back to `slug`)
 *          plus counts of cities queried and cities that returned results
 */
export async function fetchAllDispensariesForState(
  stateCode: string,
  options: {
    verbose?: boolean;
    progressCallback?: (city: string, count: number, total: number) => void;
  } = {}
): Promise<{ dispensaries: DutchieLocationResponse[]; citiesQueried: number; citiesWithResults: number }> {
  const { verbose = false, progressCallback } = options;

  console.log(`[LocationDiscovery] Fetching all dispensaries for ${stateCode}...`);

  // Step 1: Get city list
  const cities = await getCitiesForState(stateCode, { verbose });
  if (cities.length === 0) {
    console.warn(`[LocationDiscovery] No cities found for ${stateCode}`);
    return { dispensaries: [], citiesQueried: 0, citiesWithResults: 0 };
  }

  console.log(`[LocationDiscovery] Will query ${cities.length} cities for ${stateCode}`);

  // Step 2: Query each city
  const allDispensaries = new Map<string, DutchieLocationResponse>();
  let citiesWithResults = 0;

  for (let i = 0; i < cities.length; i++) {
    const city = cities[i];

    if (progressCallback) {
      progressCallback(city, i + 1, cities.length);
    }

    try {
      const dispensaries = await fetchDispensariesByCityState(city, stateCode, { verbose });

      if (dispensaries.length > 0) {
        citiesWithResults++;
        for (const d of dispensaries) {
          const id = d.id || d.slug;
          if (id && !allDispensaries.has(id)) {
            allDispensaries.set(id, d);
          }
        }
      }
    } catch (error: unknown) {
      console.error(`[LocationDiscovery] Error querying ${city}: ${toErrorMessage(error)}`);
    }

    // Pause between cities (also after a failure) to avoid rate limiting;
    // there is nothing to wait for after the final city.
    if (i < cities.length - 1) {
      await new Promise((r) => setTimeout(r, CITY_QUERY_DELAY_MS));
    }
  }

  const result = Array.from(allDispensaries.values());
  console.log(`[LocationDiscovery] Total: ${result.length} unique dispensaries across ${citiesWithResults}/${cities.length} cities`);

  return {
    dispensaries: result,
    citiesQueried: cities.length,
    citiesWithResults,
  };
}

// ============================================================
// GRAPHQL / API
// FETCHING (LEGACY - PUPPETEER-BASED)
// ============================================================

/** Browser-backed session used by the legacy (Puppeteer-based) fetch helpers. */
interface SessionCredentials {
  /** Cookie header value: `name=value` pairs joined with "; " */
  cookies: string;
  /** User agent string set on the page and reused for direct HTTP calls */
  userAgent: string;
  browser: Browser;
  page: Page;
}

/** Best-effort message extraction for values caught in `catch` clauses. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Create a browser session for fetching location data.
 *
 * Launches a browser, opens the Dutchie dispensaries page for the given
 * state/city so cookies are established, and returns the cookies together with
 * the live browser and page. Navigation failures are logged as warnings and do
 * not abort session creation. If any other setup step throws, the browser is
 * closed before the error is re-thrown so no browser process is leaked.
 *
 * @param citySlug - City slug used in the dispensaries URL
 * @param stateCode - State code used in the dispensaries URL (lower-cased);
 *        defaults to 'az', which was previously hard-coded
 * @returns Session credentials; the caller must close them with closeSession
 */
async function createSession(citySlug: string, stateCode: string = 'az'): Promise<SessionCredentials> {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled',
    ],
  });

  try {
    const page = await browser.newPage();
    const userAgent =
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

    await page.setUserAgent(userAgent);
    await page.setViewport({ width: 1920, height: 1080 });
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
      (window as any).chrome = { runtime: {} };
    });

    // Navigate to a dispensaries page to get cookies
    const url = `https://dutchie.com/dispensaries/${stateCode.toLowerCase()}/${citySlug}`;
    console.log(`[LocationDiscovery] Loading ${url} to establish session...`);

    try {
      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout: 60000,
      });
      await new Promise((r) => setTimeout(r, 2000));
    } catch (error: unknown) {
      console.warn(`[LocationDiscovery] Navigation warning: ${describeError(error)}`);
    }

    const cookies = await page.cookies();
    const cookieString = cookies
      .map((c: Protocol.Network.Cookie) => `${c.name}=${c.value}`)
      .join('; ');

    return { cookies: cookieString, userAgent, browser, page };
  } catch (error) {
    // Setup failed after launch: release the browser, then surface the original error
    await browser.close().catch(() => undefined);
    throw error;
  }
}

/** Close the browser that backs a session. */
async function closeSession(session: SessionCredentials): Promise<void> {
  await session.browser.close();
}

/**
 * Fetch locations for a city.
 *
 * PRIMARY METHOD: Uses city+state GraphQL filter (source of truth)
 * FALLBACK: Legacy Puppeteer-based methods for edge cases
 *
 * The fallback only runs when `useLegacyMethods` is true and the primary
 * method returned nothing or threw. A session created here is always closed
 * before returning; a session passed in by the caller is left open.
 *
 * @param city - City to fetch; `cityName` and `stateCode` drive the primary query
 * @param options.session - Existing browser session to reuse for the fallback
 * @param options.verbose - Enable extra logging
 * @param options.useLegacyMethods - Enable the Puppeteer-based fallback
 * @returns Normalized locations, or an empty array when nothing was found
 */
export async function fetchLocationsForCity(
  city: DiscoveryCity,
  options: {
    session?: SessionCredentials;
    verbose?: boolean;
    useLegacyMethods?: boolean;
  } = {}
): Promise<DutchieLocationResponse[]> {
  const { verbose = false, useLegacyMethods = false } = options;

  console.log(`[LocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);

  // PRIMARY METHOD: City+State GraphQL query (SOURCE OF TRUTH)
  if (city.cityName && city.stateCode) {
    try {
      const locations = await fetchDispensariesByCityState(city.cityName, city.stateCode, { verbose });
      if (locations.length > 0) {
        console.log(`[LocationDiscovery] Found ${locations.length} locations via GraphQL city+state`);
        return locations;
      }
    } catch (error: unknown) {
      console.warn(`[LocationDiscovery] GraphQL city+state failed: ${describeError(error)}`);
    }
  }

  // FALLBACK: Legacy Puppeteer-based methods (only if explicitly enabled)
  if (useLegacyMethods) {
    let session = options.session;
    let shouldCloseSession = false;

    if (!session) {
      // Use the city's own state for the session URL (falls back to 'az')
      session = await createSession(city.citySlug, city.stateCode || 'az');
      shouldCloseSession = true;
    }

    try {
      // Legacy Approach 1: Extract from page __NEXT_DATA__
      const locations = await extractLocationsFromPage(session.page, verbose);
      if (locations.length > 0) {
        console.log(`[LocationDiscovery] Found ${locations.length} locations from page data (legacy)`);
        return locations;
      }

      // Legacy Approach 2: Try the geo-based GraphQL query
      // NOTE: Geo queries are for SEARCH RESULTS only, not source of truth
      const geoLocations = await fetchLocationsViaGraphQL(session, city, verbose);
      if (geoLocations.length > 0) {
        console.log(`[LocationDiscovery] Found ${geoLocations.length} locations from geo GraphQL (legacy)`);
        return geoLocations;
      }

      // Legacy Approach 3: Scrape visible location cards
      const scrapedLocations = await scrapeLocationCards(session.page, verbose);
      if (scrapedLocations.length > 0) {
        console.log(`[LocationDiscovery] Found ${scrapedLocations.length} locations from scraping (legacy)`);
        return scrapedLocations;
      }
    } finally {
      if (shouldCloseSession) {
        await closeSession(session);
      }
    }
  }

  console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
  return [];
}

/**
 * Extract locations from page's embedded data (__NEXT_DATA__, window.*, etc.)
 *
 * Looks first at the `#__NEXT_DATA__` script element, then at
 * `window.__APOLLO_STATE__` entries whose key starts with "Dispensary:".
 *
 * @returns Normalized locations, or an empty array if nothing was found or
 *          page evaluation failed
 */
async function extractLocationsFromPage(
  page: Page,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  try {
    // NOTE: this callback runs inside the browser page; it cannot reference
    // anything from this module's scope.
    const data = await page.evaluate(() => {
      // Try __NEXT_DATA__
      const nextDataEl = document.querySelector('#__NEXT_DATA__');
      if (nextDataEl?.textContent) {
        try {
          const nextData = JSON.parse(nextDataEl.textContent);
          // Look for dispensaries in various paths
          const dispensaries =
            nextData?.props?.pageProps?.dispensaries ||
            nextData?.props?.pageProps?.initialDispensaries ||
            nextData?.props?.pageProps?.data?.dispensaries ||
            [];
          if (Array.isArray(dispensaries) && dispensaries.length > 0) {
            return { source: '__NEXT_DATA__', dispensaries };
          }
        } catch {
          // Ignore parse errors
        }
      }

      // Try window variables
      const win = window as any;
      if (win.__APOLLO_STATE__) {
        // Extract from Apollo cache
        const entries = Object.entries(win.__APOLLO_STATE__).filter(
          ([key]) => key.startsWith('Dispensary:')
        );
        if (entries.length > 0) {
          return { source: 'APOLLO_STATE', dispensaries: entries.map(([, v]) => v) };
        }
      }

      return { source: 'none', dispensaries: [] };
    });

    if (verbose) {
      console.log(`[LocationDiscovery] Page data source: ${data.source}, count: ${data.dispensaries.length}`);
    }

    return data.dispensaries.map((d: any) => normalizeLocationResponse(d));
  } catch (error: unknown) {
    if (verbose) {
      console.log(`[LocationDiscovery] Could not extract from page data: ${describeError(error)}`);
    }
    return [];
  }
}

/**
 * Fetch locations via GraphQL geo-based query.
 *
 * Uses ConsumerDispensaries with geo filtering:
 * - dispensaryFilter.nearLat/nearLng for center point
 * - dispensaryFilter.distance for radius in miles
 * - Response at data.filteredDispensaries
 *
 * The center point comes from a small built-in table of cities, then states,
 * then a fixed default. When the city has a state code, results are filtered
 * to that state (case-insensitive), because the radius may reach neighbouring
 * states.
 *
 * @returns Normalized locations, or an empty array on a non-200 response or error
 */
async function fetchLocationsViaGraphQL(
  session: SessionCredentials,
  city: DiscoveryCity,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  type GeoCenter = { lat: number; lng: number; radius: number };

  // City center coordinates with appropriate radius
  const CITY_COORDS: Record<string, GeoCenter> = {
    'phoenix': { lat: 33.4484, lng: -112.074, radius: 50 },
    'tucson': { lat: 32.2226, lng: -110.9747, radius: 50 },
    'scottsdale': { lat: 33.4942, lng: -111.9261, radius: 30 },
    'mesa': { lat: 33.4152, lng: -111.8315, radius: 30 },
    'tempe': { lat: 33.4255, lng: -111.94, radius: 30 },
    'flagstaff': { lat: 35.1983, lng: -111.6513, radius: 50 },
  };

  // State-wide coordinates for full coverage
  const STATE_COORDS: Record<string, GeoCenter> = {
    'AZ': { lat: 33.4484, lng: -112.074, radius: 200 },
    'CA': { lat: 36.7783, lng: -119.4179, radius: 400 },
    'CO': { lat: 39.5501, lng: -105.7821, radius: 200 },
    'FL': { lat: 27.6648, lng: -81.5158, radius: 400 },
    'MI': { lat: 44.3148, lng: -85.6024, radius: 250 },
    'NV': { lat: 36.1699, lng: -115.1398, radius: 200 },
  };

  const stateUpper: string = city.stateCode ? String(city.stateCode).toUpperCase() : '';

  // Try city-specific coords first, then state-wide, then default
  const coords: GeoCenter =
    CITY_COORDS[city.citySlug] ||
    (stateUpper ? STATE_COORDS[stateUpper] : undefined) ||
    { lat: 33.4484, lng: -112.074, radius: 200 };

  // Correct GraphQL variables for ConsumerDispensaries
  const variables = {
    dispensaryFilter: {
      activeOnly: true,
      nearLat: coords.lat,
      nearLng: coords.lng,
      distance: coords.radius,
    },
    page: 0,
    perPage: 200,
  };

  // NOTE(review): persisted-query hash is hard-coded here while the primary
  // path uses GRAPHQL_HASHES.ConsumerDispensaries — confirm they are meant to match.
  const hash = '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b';

  // Avoid emitting ".../dispensaries/undefined/<slug>" when no state is known
  const referer = stateUpper
    ? `https://dutchie.com/dispensaries/${stateUpper.toLowerCase()}/${city.citySlug}`
    : 'https://dutchie.com/dispensaries';

  try {
    const response = await axios.post(
      'https://dutchie.com/api-3/graphql',
      {
        operationName: 'ConsumerDispensaries',
        variables,
        extensions: {
          persistedQuery: { version: 1, sha256Hash: hash },
        },
      },
      {
        headers: {
          'content-type': 'application/json',
          'origin': 'https://dutchie.com',
          'referer': referer,
          'user-agent': session.userAgent,
          'cookie': session.cookies,
        },
        timeout: 30000,
        // Never throw on HTTP status; non-200 is handled below
        validateStatus: () => true,
      }
    );

    if (response.status !== 200) {
      if (verbose) {
        console.log(`[LocationDiscovery] GraphQL returned ${response.status}`);
      }
      return [];
    }

    // Response is at data.filteredDispensaries
    const payload = response.data?.data?.filteredDispensaries;
    const dispensaries: any[] = Array.isArray(payload) ? payload : [];

    // Filter to specific state if needed (radius may include neighboring states)
    const filtered = stateUpper
      ? dispensaries.filter(
          (d: any) => String(d?.location?.state ?? '').toUpperCase() === stateUpper
        )
      : dispensaries;

    if (verbose) {
      console.log(`[LocationDiscovery] GraphQL returned ${dispensaries.length} total, ${filtered.length} in ${city.stateCode || 'all states'}`);
    }

    return filtered.map((d: any) => normalizeLocationResponse(d));
  } catch (error: unknown) {
    if (verbose) {
      console.log(`[LocationDiscovery] GraphQL error: ${describeError(error)}`);
    }
    return [];
  }
}

/**
 * Scrape location cards from the visible page.
 *
 * Tries a list of selectors in order and uses the first one that matches any
 * element. Cards without both a slug (from a `/dispensary/<slug>` link) and a
 * name are ignored. Scraped entries carry an empty `id`.
 *
 * @returns Scraped locations, or an empty array on error
 */
async function scrapeLocationCards(
  page: Page,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  try {
    // NOTE: this callback runs inside the browser page.
    const locations = await page.evaluate(() => {
      const cards: any[] = [];

      // Look for common dispensary card patterns
      const selectors = [
        '[data-testid="dispensary-card"]',
        '.dispensary-card',
        'a[href*="/dispensary/"]',
        '[class*="DispensaryCard"]',
      ];

      for (const selector of selectors) {
        const elements = document.querySelectorAll(selector);
        if (elements.length > 0) {
          elements.forEach((el) => {
            const link = el.querySelector('a')?.href || (el as HTMLAnchorElement).href || '';
            const name = el.querySelector('h2, h3, [class*="name"]')?.textContent?.trim() || '';
            const address = el.querySelector('[class*="address"], address')?.textContent?.trim() || '';

            // Extract slug from URL
            const slugMatch = link.match(/\/dispensary\/([^/?]+)/);
            const slug = slugMatch ? slugMatch[1] : '';

            if (slug && name) {
              cards.push({
                slug,
                name,
                address,
                menuUrl: link,
              });
            }
          });
          break; // Stop after first successful selector
        }
      }

      return cards;
    });

    return locations.map((d: any) => ({
      id: '',
      name: d.name,
      slug: d.slug,
      address: d.address,
      menuUrl: d.menuUrl,
    }));
  } catch (error: unknown) {
    if (verbose) {
      console.log(`[LocationDiscovery] Scraping error: ${describeError(error)}`);
    }
    return [];
  }
}

/**
 * Normalize a raw location response to a consistent format.
 *
 * Collects the commonly used fields from the various places they may appear
 * in a raw record (top level, `location`, `storeSettings`, `retailer`) and
 * returns them together with every original raw key.
 *
 * Merge rule: a raw value that is not null/undefined wins over the computed
 * value for the same key (raw data is preserved), but an explicit
 * null/undefined in the raw record no longer erases a computed fallback. For
 * example `{ id: null, _id: 'abc' }` yields `id: 'abc'`.
 *
 * @param raw - Raw dispensary record; non-object input is treated as empty
 */
function normalizeLocationResponse(raw: any): DutchieLocationResponse {
  const source: Record<string, any> = raw !== null && typeof raw === 'object' ? raw : {};

  const slug = source.slug || source.cName || source.urlSlug || '';
  const id = source.id || source._id || source.dispensaryId || '';

  // Extract location data - GraphQL response nests address info in .location
  const loc = source.location || {};

  // Extract coordinates from geometry.coordinates [longitude, latitude]
  const coords = loc.geometry?.coordinates || [];
  const longitude = coords[0] || source.longitude || source.lng || loc.longitude || loc.lng;
  const latitude = coords[1] || source.latitude || source.lat || loc.latitude || loc.lat;

  const normalized: Record<string, any> = {
    id,
    name: source.name || source.dispensaryName || '',
    slug,
    cName: source.cName || source.slug || '',
    address: source.address || source.fullAddress || loc.ln1 || '',
    address1: source.address1 || source.addressLine1 || source.streetAddress || loc.ln1 || '',
    address2: source.address2 || source.addressLine2 || loc.ln2 || '',
    city: source.city || loc.city || '',
    state: source.state || source.stateCode || loc.state || '',
    zip: source.zip || source.zipCode || source.postalCode || loc.zipcode || loc.zip || '',
    country: source.country || source.countryCode || loc.country || 'United States',
    latitude,
    longitude,
    timezone: source.timezone || source.tz || '',
    menuUrl: source.menuUrl || (slug ? `https://dutchie.com/dispensary/${slug}` : ''),
    retailType: source.retailType || source.type || '',
    // Service offerings
    offerPickup: source.offerPickup ?? source.storeSettings?.offerPickup ?? true,
    offerDelivery: source.offerDelivery ?? source.storeSettings?.offerDelivery ?? false,
    offerCurbsidePickup: source.offerCurbsidePickup ?? false,
    // License types
    isRecreational:
      source.isRecreational ?? source.recDispensary ?? source.retailType?.includes('Recreational') ?? true,
    isMedical:
      source.isMedical ?? source.medicalDispensary ?? source.retailType?.includes('Medical') ?? true,
    // Contact info
    phone: source.phone || '',
    email: source.email || '',
    website: source.embedBackUrl || '',
    // Branding
    description: source.description || '',
    logoImage: source.logoImage || '',
    bannerImage: source.bannerImage || '',
    // Chain/enterprise info
    chainSlug: source.chain || '',
    enterpriseId: source.retailer?.enterpriseId || '',
    // Status
    status: source.status || '',
  };

  // Preserve raw data without letting null/undefined clobber computed values
  const merged: Record<string, any> = { ...normalized };
  for (const [key, value] of Object.entries(source)) {
    if (value != null || !(key in normalized)) {
      merged[key] = value;
    }
  }

  return merged as DutchieLocationResponse;
}

// ============================================================
// DATABASE OPERATIONS
// ============================================================

/**
 * Upsert a location into dutchie_discovery_locations.
 * REQUIRES a valid platform ID (MongoDB ObjectId) - will skip records without one.
 *
 * On conflict of (platform, platform_location_id) the existing row is updated;
 * most columns keep their stored value when the incoming value is NULL.
 *
 * @param pool - Database pool used to run the statement
 * @param location - Normalized location to store
 * @param cityId - ID written to discovery_city_id on insert, or null
 * @returns The row ID and whether the row was newly inserted, or null when the
 *          location has no platform ID and was skipped
 */
export async function upsertLocation(
  pool: Pool,
  location: DutchieLocationResponse,
  cityId: number | null
): Promise<{ id: number; isNew: boolean } | null> {
  // REQUIRE actual platform ID - NO fallback to slug
  const platformLocationId = location.id;

  if (!platformLocationId) {
    console.warn(`[LocationDiscovery] Skipping location without platform ID: ${location.name} (${location.slug})`);
    return null;
  }

  const menuUrl = location.menuUrl || `https://dutchie.com/dispensary/${location.slug}`;

  // NOTE(review): country_code ($12) and country ($31) both receive
  // `location.country || 'United States'` — confirm country_code is not
  // expected to hold a short code such as 'US'.
  const countryValue = location.country || 'United States';

  const params = [
    PLATFORM, // $1 platform
    platformLocationId, // $2 platform_location_id
    location.slug, // $3 platform_slug
    menuUrl, // $4 platform_menu_url
    location.name, // $5 name
    location.address || null, // $6 raw_address
    location.address1 || null, // $7 address_line1
    location.address2 || null, // $8 address_line2
    location.city || null, // $9 city
    location.state || null, // $10 state_code
    location.zip || null, // $11 postal_code
    countryValue, // $12 country_code
    location.latitude || null, // $13 latitude
    location.longitude || null, // $14 longitude
    location.timezone || null, // $15 timezone
    cityId, // $16 discovery_city_id
    JSON.stringify(location), // $17 metadata
    location.offerDelivery ?? null, // $18 offers_delivery
    location.offerPickup ?? null, // $19 offers_pickup
    location.isRecreational ?? null, // $20 is_recreational
    location.isMedical ?? null, // $21 is_medical
    location.phone || null, // $22 phone
    location.website || null, // $23 website
    location.email || null, // $24 email
    location.description || null, // $25 description
    location.logoImage || null, // $26 logo_image
    location.bannerImage || null, // $27 banner_image
    location.chainSlug || null, // $28 chain_slug
    location.enterpriseId || null, // $29 enterprise_id
    location.cName || null, // $30 c_name
    countryValue, // $31 country
    location.status || null, // $32 store_status
  ];

  const result = await pool.query(
    `INSERT INTO dutchie_discovery_locations (
      platform,
      platform_location_id,
      platform_slug,
      platform_menu_url,
      name,
      raw_address,
      address_line1,
      address_line2,
      city,
      state_code,
      postal_code,
      country_code,
      latitude,
      longitude,
      timezone,
      discovery_city_id,
      metadata,
      offers_delivery,
      offers_pickup,
      is_recreational,
      is_medical,
      phone,
      website,
      email,
      description,
      logo_image,
      banner_image,
      chain_slug,
      enterprise_id,
      c_name,
      country,
      store_status,
      last_seen_at,
      updated_at
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, NOW(), NOW())
    ON CONFLICT (platform, platform_location_id)
    DO UPDATE SET
      name = EXCLUDED.name,
      platform_menu_url = EXCLUDED.platform_menu_url,
      raw_address = COALESCE(EXCLUDED.raw_address, dutchie_discovery_locations.raw_address),
      address_line1 = COALESCE(EXCLUDED.address_line1, dutchie_discovery_locations.address_line1),
      address_line2 = COALESCE(EXCLUDED.address_line2, dutchie_discovery_locations.address_line2),
      city = COALESCE(EXCLUDED.city, dutchie_discovery_locations.city),
      state_code = COALESCE(EXCLUDED.state_code, dutchie_discovery_locations.state_code),
      postal_code = COALESCE(EXCLUDED.postal_code, dutchie_discovery_locations.postal_code),
      latitude = COALESCE(EXCLUDED.latitude, dutchie_discovery_locations.latitude),
      longitude = COALESCE(EXCLUDED.longitude, dutchie_discovery_locations.longitude),
      timezone = COALESCE(EXCLUDED.timezone, dutchie_discovery_locations.timezone),
      metadata = EXCLUDED.metadata,
      offers_delivery = COALESCE(EXCLUDED.offers_delivery, dutchie_discovery_locations.offers_delivery),
      offers_pickup = COALESCE(EXCLUDED.offers_pickup, dutchie_discovery_locations.offers_pickup),
      is_recreational = COALESCE(EXCLUDED.is_recreational, dutchie_discovery_locations.is_recreational),
      is_medical = COALESCE(EXCLUDED.is_medical, dutchie_discovery_locations.is_medical),
      phone = COALESCE(EXCLUDED.phone, dutchie_discovery_locations.phone),
      website = COALESCE(EXCLUDED.website, dutchie_discovery_locations.website),
      email = COALESCE(EXCLUDED.email, dutchie_discovery_locations.email),
      description = COALESCE(EXCLUDED.description, dutchie_discovery_locations.description),
      logo_image = COALESCE(EXCLUDED.logo_image, dutchie_discovery_locations.logo_image),
      banner_image = COALESCE(EXCLUDED.banner_image, dutchie_discovery_locations.banner_image),
      chain_slug = COALESCE(EXCLUDED.chain_slug, dutchie_discovery_locations.chain_slug),
      enterprise_id = COALESCE(EXCLUDED.enterprise_id, dutchie_discovery_locations.enterprise_id),
      c_name = COALESCE(EXCLUDED.c_name, dutchie_discovery_locations.c_name),
      country = COALESCE(EXCLUDED.country, dutchie_discovery_locations.country),
      store_status = COALESCE(EXCLUDED.store_status, dutchie_discovery_locations.store_status),
      last_seen_at = NOW(),
      updated_at = NOW()
    RETURNING id, (xmax = 0) as is_new`,
    params
  );

  return {
    id: result.rows[0].id,
    isNew: result.rows[0].is_new,
  };
}

/**
 * Get locations by status.
 *
 * Only rows with `active = TRUE` are returned, newest `first_seen_at` first.
 *
 * @param options.stateCode - Optional exact match on state_code
 * @param options.countryCode - Optional exact match on country_code
 * @param options.limit - Maximum rows (default 100)
 * @param options.offset - Rows to skip (default 0)
 */
export async function getLocationsByStatus(
  pool: Pool,
  status: DiscoveryStatus,
  options: {
    stateCode?: string;
    countryCode?: string;
    limit?: number;
    offset?: number;
  } = {}
): Promise<DiscoveryLocation[]> {
  const { stateCode, countryCode, limit = 100, offset = 0 } = options;

  let query = `
    SELECT * FROM dutchie_discovery_locations
    WHERE status = $1 AND active = TRUE
  `;
  const params: any[] = [status];
  let paramIdx = 2;

  if (stateCode) {
    query += ` AND state_code = $${paramIdx}`;
    params.push(stateCode);
    paramIdx++;
  }

  if (countryCode) {
    query += ` AND country_code = $${paramIdx}`;
    params.push(countryCode);
    paramIdx++;
  }

  query += ` ORDER BY first_seen_at DESC LIMIT $${paramIdx} OFFSET $${paramIdx + 1}`;
  params.push(limit, offset);

  const result = await pool.query(query, params);
  return result.rows.map(mapLocationRowToLocation);
}

/**
 * Get a location by ID.
 *
 * @returns The mapped location, or null when no row has that ID
 */
export async function getLocationById(
  pool: Pool,
  id: number
): Promise<DiscoveryLocation | null> {
  const result = await pool.query(
    `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
    [id]
  );

  if (result.rows.length === 0) {
    return null;
  }

  return mapLocationRowToLocation(result.rows[0]);
}

/**
 * Update location status.
 *
 * Sets `verified_at` to NOW() when the new status is 'verified' or 'merged'.
 * `dispensary_id`, `verified_by` and `notes` are only overwritten when a
 * truthy value is supplied; otherwise the stored value is kept.
 */
export async function updateLocationStatus(
  pool: Pool,
  locationId: number,
  status: DiscoveryStatus,
  options: {
    dispensaryId?: number;
    verifiedBy?: string;
    notes?: string;
  } = {}
): Promise<void> {
  const { dispensaryId, verifiedBy, notes } = options;

  await pool.query(
    `UPDATE dutchie_discovery_locations
     SET status = $2,
         dispensary_id = COALESCE($3, dispensary_id),
         verified_at = CASE WHEN $2 IN ('verified', 'merged') THEN NOW() ELSE verified_at END,
         verified_by = COALESCE($4, verified_by),
         notes = COALESCE($5, notes),
         updated_at = NOW()
     WHERE id = $1`,
    [locationId, status, dispensaryId || null, verifiedBy || null, notes || null]
  );
}

/**
 * Search locations by name or address.
 *
 * Performs a case-insensitive substring match against name, city, raw_address
 * and platform_slug on active rows. The search text is matched literally:
 * the LIKE metacharacters `%`, `_` and `\` are escaped before the pattern is
 * built, so a query such as "50%" does not behave as a wildcard.
 *
 * @param query - Text to search for
 * @param options.status - Optional exact match on status
 * @param options.stateCode - Optional exact match on state_code
 * @param options.limit - Maximum rows (default 50)
 */
export async function searchLocations(
  pool: Pool,
  query: string,
  options: {
    status?: DiscoveryStatus;
    stateCode?: string;
    limit?: number;
  } = {}
): Promise<DiscoveryLocation[]> {
  const { status, stateCode, limit = 50 } = options;

  // Backslash is the default LIKE escape character in PostgreSQL
  const escapedQuery = query.replace(/[\\%_]/g, '\\$&');
  const searchPattern = `%${escapedQuery}%`;

  let sql = `
    SELECT * FROM dutchie_discovery_locations
    WHERE active = TRUE
      AND (name ILIKE $1 OR city ILIKE $1 OR raw_address ILIKE $1 OR platform_slug ILIKE $1)
  `;
  const params: any[] = [searchPattern];
  let paramIdx = 2;

  if (status) {
    sql += ` AND status = $${paramIdx}`;
    params.push(status);
    paramIdx++;
  }

  if (stateCode) {
    sql += ` AND state_code = $${paramIdx}`;
    params.push(stateCode);
    paramIdx++;
  }

  sql += ` ORDER BY name LIMIT $${paramIdx}`;
  params.push(limit);

  const result = await pool.query(sql, params);
  return result.rows.map(mapLocationRowToLocation);
}

// ============================================================
// MAIN DISCOVERY FUNCTION
// ============================================================

/**
 * Discover locations for a specific city.
 *
 * Fetches the city's locations and upserts each one. Per-location failures are
 * collected in the result's `errors` rather than thrown. Unless this is a dry
 * run, the city's crawl timestamp and location count are updated afterwards.
 * When no locations are found the function returns early without touching the
 * city row.
 *
 * @param options.dryRun - When true, nothing is written; every fetched
 *        location is counted as new
 * @param options.verbose - Enable per-location logging
 * @returns Counts, collected error messages and elapsed time for the run
 */
export async function discoverLocationsForCity(
  pool: Pool,
  city: DiscoveryCity,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult> {
  const startTime = Date.now();
  const { dryRun = false, verbose = false } = options;
  const errors: string[] = [];

  console.log(`[LocationDiscovery] Discovering locations for ${city.cityName}, ${city.stateCode}...`);
  console.log(`[LocationDiscovery] Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);

  const locations = await fetchLocationsForCity(city, { verbose });

  if (locations.length === 0) {
    console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
    return {
      cityId: city.id,
      citySlug: city.citySlug,
      locationsFound: 0,
      locationsUpserted: 0,
      locationsNew: 0,
      locationsUpdated: 0,
      errors: [],
      durationMs: Date.now() - startTime,
    };
  }

  let newCount = 0;
  let updatedCount = 0;

  for (const location of locations) {
    try {
      if (dryRun) {
        if (verbose) {
          console.log(`[LocationDiscovery][DryRun] Would upsert: ${location.name} (${location.slug})`);
        }
        newCount++;
        continue;
      }

      const result = await upsertLocation(pool, location, city.id);

      // Skip locations without valid platform ID
      if (!result) {
        errors.push(`Location ${location.slug}: No valid platform ID - skipped`);
        continue;
      }

      if (result.isNew) {
        newCount++;
      } else {
        updatedCount++;
      }

      if (verbose) {
        const action = result.isNew ? 'Created' : 'Updated';
        console.log(`[LocationDiscovery] ${action}: ${location.name} -> ID ${result.id}`);
      }
    } catch (error: unknown) {
      errors.push(`Location ${location.slug}: ${describeError(error)}`);
    }
  }

  // Update city crawl status
  if (!dryRun) {
    await pool.query(
      `UPDATE dutchie_discovery_cities
       SET last_crawled_at = NOW(),
           location_count = $2,
           updated_at = NOW()
       WHERE id = $1`,
      [city.id, locations.length]
    );
  }

  const durationMs = Date.now() - startTime;

  console.log(`[LocationDiscovery] Complete for ${city.cityName}: ${newCount} new, ${updatedCount} updated, ${errors.length} errors in ${durationMs}ms`);

  return {
    cityId: city.id,
    citySlug: city.citySlug,
    locationsFound: locations.length,
    locationsUpserted: newCount + updatedCount,
    locationsNew: newCount,
    locationsUpdated: updatedCount,
    errors,
    durationMs,
  };
}