## SEO Template Library
- Add complete template library with 7 page types (state, city, category, brand, product, search, regeneration)
- Add Template Library tab in SEO Orchestrator with accordion-based editors
- Add template preview, validation, and variable injection engine
- Add API endpoints: /api/seo/templates, preview, validate, generate, regenerate

## Discovery Pipeline
- Add promotion.ts for discovery location validation and promotion
- Add discover-all-states.ts script for multi-state discovery
- Add promotion log migration (067)
- Enhance discovery routes and types

## Orchestrator & Admin
- Add crawl_enabled filter to stores page
- Add API permissions page
- Add job queue management
- Add price analytics routes
- Add markets and intelligence routes
- Enhance dashboard and worker monitoring

## Infrastructure
- Add migrations for worker definitions, SEO settings, field alignment
- Add canonical pipeline for scraper v2
- Update hydration and sync orchestrator
- Enhance multi-state query service

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1143 lines
36 KiB
TypeScript
1143 lines
36 KiB
TypeScript
/**
 * Dutchie Location Discovery Service
 *
 * Discovers store locations from Dutchie city pages.
 * Each city can contain multiple dispensary locations.
 *
 * This module:
 * 1. Fetches location listings for a given city
 * 2. Upserts locations into dutchie_discovery_locations
 * 3. Does NOT create any canonical dispensary records
 *
 * Locations remain in "discovered" status until manually verified.
 */
|
|
|
|
import { Pool } from 'pg';
|
|
import axios from 'axios';
|
|
import puppeteer from 'puppeteer-extra';
|
|
import type { Browser, Page, Protocol } from 'puppeteer';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
import {
|
|
DiscoveryLocation,
|
|
DiscoveryLocationRow,
|
|
DutchieLocationResponse,
|
|
LocationDiscoveryResult,
|
|
DiscoveryStatus,
|
|
mapLocationRowToLocation,
|
|
} from './types';
|
|
import { DiscoveryCity } from './types';
|
|
import {
|
|
executeGraphQL,
|
|
fetchPage,
|
|
extractNextData,
|
|
GRAPHQL_HASHES,
|
|
setProxy,
|
|
} from '../platforms/dutchie/client';
|
|
import { getStateProxy, getRandomProxy } from '../utils/proxyManager';
|
|
|
|
// Register the stealth plugin on the shared puppeteer-extra instance at module
// load; createSession() launches its browsers through this same instance.
puppeteer.use(StealthPlugin());
|
|
|
|
// ============================================================
// PROXY INITIALIZATION
// ============================================================
// Call initDiscoveryProxy() before any discovery operations to
// set up a proxy when the USE_PROXY=true environment variable is set.
// This is opt-in and does NOT break existing behavior.
// ============================================================
|
|
|
|
// Module-level guard: set to true once initDiscoveryProxy() has configured a
// proxy, and cleared again by resetProxyInit().
let proxyInitialized = false;
|
|
|
|
/**
 * Configure the Dutchie client's proxy for discovery work.
 *
 * Opt-in: nothing happens unless the USE_PROXY environment variable is exactly
 * 'true'. Once a proxy has been set, later calls return true immediately
 * without selecting a new proxy (even if a different state code is passed).
 *
 * @param stateCode - Optional state code (e.g., 'AZ', 'CA'); when given, a
 *   state-specific proxy is requested, otherwise a random one
 * @returns true if a proxy is (or already was) set; false if proxying is
 *   disabled, no proxy was available, or initialization failed
 */
export async function initDiscoveryProxy(stateCode?: string): Promise<boolean> {
  // Already configured - nothing more to do
  if (proxyInitialized) {
    return true;
  }

  // Proxying must be explicitly enabled
  if (process.env.USE_PROXY !== 'true') {
    console.log('[LocationDiscovery] Proxy disabled (USE_PROXY != true)');
    return false;
  }

  try {
    // State-specific proxy when a state code is supplied, random otherwise
    const selected = await (stateCode ? getStateProxy(stateCode) : getRandomProxy());

    if (!selected) {
      console.warn('[LocationDiscovery] No proxy available, proceeding without proxy');
      return false;
    }

    // Embed credentials in the URL only when both parts are present
    let target = selected.server;
    if (selected.username && selected.password) {
      const withAuth = new URL(selected.server);
      withAuth.username = selected.username;
      withAuth.password = selected.password;
      target = withAuth.toString();
    }

    // Hand the proxy to the Dutchie client and remember that we did
    setProxy(target);
    proxyInitialized = true;

    console.log(`[LocationDiscovery] Proxy initialized for ${stateCode || 'general'} discovery`);
    return true;
  } catch (err: any) {
    console.error(`[LocationDiscovery] Failed to initialize proxy: ${err.message}`);
    return false;
  }
}
|
|
|
|
/**
 * Clear this module's proxy state: resets the initialization guard and passes
 * null to the Dutchie client's setProxy(), so the next initDiscoveryProxy()
 * call selects a proxy again. Intended for tests or forced re-initialization.
 */
export function resetProxyInit(): void {
  proxyInitialized = false;
  setProxy(null);
}
|
|
|
|
// Platform identifier; upsertLocation() writes it to the `platform` column.
const PLATFORM = 'dutchie';
|
|
|
|
// ============================================================
// CITY-BASED DISCOVERY (CANONICAL SOURCE OF TRUTH)
// ============================================================
// GraphQL with a city + state filter is the SOURCE OF TRUTH for database data.
//
// Method:
// 1. Get the city list from statesWithDispensaries (in __NEXT_DATA__)
// 2. Query stores per city using the city + state GraphQL filter
// 3. This gives us complete, accurate dispensary data
//
// Geo-coordinate queries (nearLat/nearLng) are ONLY for showing search
// results to users (e.g., "stores within 20 miles of me").
// They are NOT a source of truth for establishing database records.
// ============================================================
|
|
|
|
/**
 * One entry of Dutchie's statesWithDispensaries data: a state together with
 * the cities in it that have dispensaries.
 */
export interface StateWithCities {
  /** State code (e.g., "CA", "AZ") */
  name: string;
  /** Country code (e.g., "US") */
  country: string;
  /** Array of city names */
  cities: string[];
}
|
|
|
|
/**
 * Load every state, with its cities, where Dutchie lists dispensaries.
 *
 * Runs the getAllCitiesByState persisted GraphQL query (after initializing the
 * proxy if enabled) and reads `data.statesWithDispensaries` from the response.
 * Entries without a name are dropped, null cities are removed, and a missing
 * country defaults to 'US'.
 *
 * @param options - verbose: additionally log the per-state city counts
 * @returns The mapped states; an empty array if the response has no
 *   statesWithDispensaries array or the request fails (the error is logged)
 */
export async function fetchStatesWithDispensaries(
  options: { verbose?: boolean } = {}
): Promise<StateWithCities[]> {
  const { verbose = false } = options;

  // Initialize proxy if USE_PROXY=true
  await initDiscoveryProxy();

  console.log('[LocationDiscovery] Fetching statesWithDispensaries via GraphQL...');

  try {
    // Direct GraphQL query; this operation takes no variables
    const response = await executeGraphQL(
      'getAllCitiesByState',
      {},
      GRAPHQL_HASHES.GetAllCitiesByState,
      { maxRetries: 3, retryOn403: true }
    );

    const rawStates = response?.data?.statesWithDispensaries;
    if (!Array.isArray(rawStates)) {
      console.error('[LocationDiscovery] statesWithDispensaries not found in response');
      return [];
    }

    // Keep named entries only and reshape them into StateWithCities
    const states: StateWithCities[] = rawStates
      .filter((entry: any) => entry && entry.name)
      .map((entry: any) => ({
        name: entry.name,
        country: entry.country || 'US',
        cities: Array.isArray(entry.cities)
          ? entry.cities.filter((c: string | null) => c !== null)
          : [],
      }));

    if (verbose) {
      console.log(`[LocationDiscovery] Found ${states.length} states`);
      states.forEach((entry) => {
        console.log(` ${entry.name}: ${entry.cities.length} cities`);
      });
    }

    console.log(`[LocationDiscovery] Loaded ${states.length} states with cities`);
    return states;
  } catch (err: any) {
    console.error(`[LocationDiscovery] Failed to fetch states: ${err.message}`);
    return [];
  }
}
|
|
|
|
/**
 * Look up the city names Dutchie lists for a single state.
 *
 * Fetches the full state list via fetchStatesWithDispensaries() and returns the
 * cities of the first state whose name matches `stateCode` case-insensitively.
 *
 * @param stateCode - State code to look for (e.g., "AZ")
 * @param options - Passed through to fetchStatesWithDispensaries()
 * @returns The state's city names, or an empty array (with a warning) if the
 *   state is not present
 */
export async function getCitiesForState(
  stateCode: string,
  options: { verbose?: boolean } = {}
): Promise<string[]> {
  const allStates = await fetchStatesWithDispensaries(options);

  for (const candidate of allStates) {
    if (candidate.name.toUpperCase() === stateCode.toUpperCase()) {
      console.log(`[LocationDiscovery] Found ${candidate.cities.length} cities for ${stateCode}`);
      return candidate.cities;
    }
  }

  console.warn(`[LocationDiscovery] No cities found for state: ${stateCode}`);
  return [];
}
|
|
|
|
/**
 * Fetch the dispensaries Dutchie lists for one city within a state.
 *
 * Pages through the ConsumerDispensaries persisted query using a city + state
 * filter (NOT geo-coordinates), keeps only entries whose `location.state`
 * matches `stateCode` case-insensitively, dedupes by `id` / `_id`, and
 * normalizes every record.
 *
 * Paging ends on an empty page, on a short page (fewer than `perPage`
 * results), after `maxPages` pages, or on the first request error. Errors are
 * logged and whatever was collected so far is still returned.
 *
 * @param city - City name
 * @param stateCode - State code (e.g., "AZ")
 * @param options - verbose: log per-page counts; perPage: page size
 *   (default 200); maxPages: maximum pages to request (default 10)
 * @returns Normalized, de-duplicated dispensary records
 */
export async function fetchDispensariesByCityState(
  city: string,
  stateCode: string,
  options: { verbose?: boolean; perPage?: number; maxPages?: number } = {}
): Promise<DutchieLocationResponse[]> {
  const { verbose = false, perPage = 200, maxPages = 10 } = options;

  // Initialize proxy if USE_PROXY=true (state-specific proxy preferred)
  await initDiscoveryProxy(stateCode);

  console.log(`[LocationDiscovery] Fetching dispensaries for ${city}, ${stateCode}...`);

  const collected: any[] = [];

  for (let pageIndex = 0; pageIndex < maxPages; pageIndex++) {
    const variables = {
      dispensaryFilter: {
        activeOnly: true,
        city,
        state: stateCode,
      },
      page: pageIndex,
      perPage,
    };

    try {
      const response = await executeGraphQL(
        'ConsumerDispensaries',
        variables,
        GRAPHQL_HASHES.ConsumerDispensaries,
        { cName: `${city.toLowerCase().replace(/\s+/g, '-')}-${stateCode.toLowerCase()}`, maxRetries: 2, retryOn403: true }
      );

      const batch = response?.data?.filteredDispensaries || [];

      if (verbose) {
        console.log(`[LocationDiscovery] Page ${pageIndex}: ${batch.length} dispensaries`);
      }

      // An empty page means there is nothing further to fetch
      if (batch.length === 0) {
        break;
      }

      // Keep only dispensaries that really are in the requested state
      collected.push(
        ...batch.filter((d: any) =>
          d.location?.state?.toUpperCase() === stateCode.toUpperCase()
        )
      );

      // A short page is the last page
      if (batch.length < perPage) {
        break;
      }
    } catch (err: any) {
      console.error(`[LocationDiscovery] Error fetching page ${pageIndex}: ${err.message}`);
      break;
    }
  }

  // Dedupe by ID, first occurrence wins; records without an ID are dropped
  const byId = new Map<string, any>();
  for (const entry of collected) {
    const key = entry.id || entry._id;
    if (key && !byId.has(key)) {
      byId.set(key, entry);
    }
  }

  const unique = Array.from(byId.values());
  console.log(`[LocationDiscovery] Found ${unique.length} unique dispensaries in ${city}, ${stateCode}`);

  return unique.map((entry) => normalizeLocationResponse(entry));
}
|
|
|
|
/**
 * Collect every dispensary in a state by querying it city by city.
 *
 * 1. Gets the state's city list via getCitiesForState()
 * 2. Queries each city in turn with fetchDispensariesByCityState(), pausing
 *    300 ms after each successful city query to avoid rate limiting
 * 3. Dedupes across cities by `id` (falling back to `slug`)
 *
 * A failure on one city is logged and does not stop the remaining cities.
 *
 * @param stateCode - State code (e.g., "AZ")
 * @param options - verbose: passed to the per-city fetches;
 *   progressCallback: invoked before each city with
 *   (city name, 1-based position, total number of cities)
 * @returns The unique dispensaries plus how many cities were queried and how
 *   many of them returned at least one dispensary
 */
export async function fetchAllDispensariesForState(
  stateCode: string,
  options: { verbose?: boolean; progressCallback?: (city: string, count: number, total: number) => void } = {}
): Promise<{ dispensaries: DutchieLocationResponse[]; citiesQueried: number; citiesWithResults: number }> {
  const { verbose = false, progressCallback } = options;

  console.log(`[LocationDiscovery] Fetching all dispensaries for ${stateCode}...`);

  // Step 1: Get city list
  const cities = await getCitiesForState(stateCode, { verbose });
  if (cities.length === 0) {
    console.warn(`[LocationDiscovery] No cities found for ${stateCode}`);
    return { dispensaries: [], citiesQueried: 0, citiesWithResults: 0 };
  }

  console.log(`[LocationDiscovery] Will query ${cities.length} cities for ${stateCode}`);

  // Step 2: Query each city, accumulating unique dispensaries
  const seen = new Map<string, DutchieLocationResponse>();
  let citiesWithResults = 0;

  for (const [index, city] of cities.entries()) {
    if (progressCallback) {
      progressCallback(city, index + 1, cities.length);
    }

    try {
      const found = await fetchDispensariesByCityState(city, stateCode, { verbose });

      if (found.length > 0) {
        citiesWithResults += 1;
        found.forEach((d) => {
          const key = d.id || d.slug;
          if (key && !seen.has(key)) {
            seen.set(key, d);
          }
        });
      }

      // Small delay between cities to avoid rate limiting
      await new Promise(r => setTimeout(r, 300));
    } catch (err: any) {
      console.error(`[LocationDiscovery] Error querying ${city}: ${err.message}`);
    }
  }

  const dispensaries = Array.from(seen.values());
  console.log(`[LocationDiscovery] Total: ${dispensaries.length} unique dispensaries across ${citiesWithResults}/${cities.length} cities`);

  return {
    dispensaries,
    citiesQueried: cities.length,
    citiesWithResults,
  };
}
|
|
|
|
// ============================================================
// GRAPHQL / API FETCHING (LEGACY - PUPPETEER-BASED)
// ============================================================
|
|
|
|
/**
 * Everything captured from a live Puppeteer session: the request headers
 * needed to replay it plus the open browser and page handles.
 */
interface SessionCredentials {
  /** Cookie header value ("name=value; name=value; ...") */
  cookies: string;
  /** User-Agent string applied to the page */
  userAgent: string;
  /** The launched browser instance */
  browser: Browser;
  /** The page opened in that browser */
  page: Page;
}
|
|
|
|
/**
 * Create a browser session for fetching location data.
 *
 * Launches a headless browser, applies a desktop Chrome user agent and a
 * 1920x1080 viewport, loads the Dutchie dispensaries page for the given
 * state/city, and captures the resulting cookies. A navigation failure is
 * logged as a warning and does not abort session creation.
 *
 * If any other setup step fails, the browser is closed before the error is
 * re-thrown so no browser process is leaked.
 *
 * @param citySlug - City slug used in the dispensaries page URL
 * @param stateCode - State code used in the URL (lower-cased). Defaults to
 *   'az', the value that was previously hard-coded
 * @returns Cookie header string, user agent, and the open browser/page handles
 * @throws Re-throws any setup error after closing the browser
 */
async function createSession(citySlug: string, stateCode: string = 'az'): Promise<SessionCredentials> {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled',
    ],
  });

  try {
    const page = await browser.newPage();
    const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

    await page.setUserAgent(userAgent);
    await page.setViewport({ width: 1920, height: 1080 });
    // Runs in every new document before page scripts: mask automation markers
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
      (window as any).chrome = { runtime: {} };
    });

    // Navigate to the state's dispensaries page for this city to get cookies
    const url = `https://dutchie.com/dispensaries/${stateCode.toLowerCase()}/${citySlug}`;
    console.log(`[LocationDiscovery] Loading ${url} to establish session...`);

    try {
      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout: 60000,
      });
      // Extra settle time after the network goes idle
      await new Promise((r) => setTimeout(r, 2000));
    } catch (error: unknown) {
      // Best effort: cookies set before the failure are still collected below
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`[LocationDiscovery] Navigation warning: ${message}`);
    }

    const cookies = await page.cookies();
    const cookieString = cookies.map((c: Protocol.Network.Cookie) => `${c.name}=${c.value}`).join('; ');

    return { cookies: cookieString, userAgent, browser, page };
  } catch (error: unknown) {
    // Setup failed after launch: release the browser, then surface the original error
    await browser.close().catch(() => undefined);
    throw error;
  }
}

/**
 * Close the browser that belongs to a session created by createSession().
 *
 * @param session - Session whose browser should be closed
 */
async function closeSession(session: SessionCredentials): Promise<void> {
  await session.browser.close();
}

/**
 * Fetch locations for a city.
 *
 * PRIMARY METHOD: city + state GraphQL filter (source of truth). Used whenever
 * the city has both a name and a state code; its result is returned if it is
 * non-empty.
 *
 * FALLBACK (only when `useLegacyMethods` is true): legacy Puppeteer-based
 * approaches, tried in order until one returns results:
 *   1. Data embedded in the page (__NEXT_DATA__ / Apollo state)
 *   2. Geo-based GraphQL query (search-result data, not a source of truth)
 *   3. Scraping the visible location cards
 *
 * When no session is supplied, one is created for the city's own state and
 * closed again before returning. A caller-supplied session is left open.
 *
 * @param city - Discovery city (cityName, stateCode and citySlug are read)
 * @param options - session: existing browser session to reuse for the legacy
 *   path; verbose: extra logging; useLegacyMethods: enable the fallback
 * @returns The locations found, or an empty array if every approach came up empty
 */
export async function fetchLocationsForCity(
  city: DiscoveryCity,
  options: {
    session?: SessionCredentials;
    verbose?: boolean;
    useLegacyMethods?: boolean;
  } = {}
): Promise<DutchieLocationResponse[]> {
  const { verbose = false, useLegacyMethods = false } = options;

  console.log(`[LocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);

  // PRIMARY METHOD: City+State GraphQL query (SOURCE OF TRUTH)
  if (city.cityName && city.stateCode) {
    try {
      const locations = await fetchDispensariesByCityState(city.cityName, city.stateCode, { verbose });
      if (locations.length > 0) {
        console.log(`[LocationDiscovery] Found ${locations.length} locations via GraphQL city+state`);
        return locations;
      }
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(`[LocationDiscovery] GraphQL city+state failed: ${message}`);
    }
  }

  // FALLBACK: Legacy Puppeteer-based methods (only if explicitly enabled)
  if (useLegacyMethods) {
    // Only a session created here is ours to close
    const shouldCloseSession = !options.session;
    const session =
      options.session ?? (await createSession(city.citySlug, city.stateCode || 'az'));

    try {
      // Legacy Approach 1: Extract from page __NEXT_DATA__
      const locations = await extractLocationsFromPage(session.page, verbose);
      if (locations.length > 0) {
        console.log(`[LocationDiscovery] Found ${locations.length} locations from page data (legacy)`);
        return locations;
      }

      // Legacy Approach 2: Try the geo-based GraphQL query
      // NOTE: Geo queries are for SEARCH RESULTS only, not source of truth
      const geoLocations = await fetchLocationsViaGraphQL(session, city, verbose);
      if (geoLocations.length > 0) {
        console.log(`[LocationDiscovery] Found ${geoLocations.length} locations from geo GraphQL (legacy)`);
        return geoLocations;
      }

      // Legacy Approach 3: Scrape visible location cards
      const scrapedLocations = await scrapeLocationCards(session.page, verbose);
      if (scrapedLocations.length > 0) {
        console.log(`[LocationDiscovery] Found ${scrapedLocations.length} locations from scraping (legacy)`);
        return scrapedLocations;
      }
    } finally {
      if (shouldCloseSession) {
        await closeSession(session);
      }
    }
  }

  console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
  return [];
}
|
|
|
|
/**
 * Pull dispensary records out of data embedded in the currently loaded page.
 *
 * Inside the page it checks, in order:
 *   1. The #__NEXT_DATA__ JSON, under props.pageProps.dispensaries,
 *      .initialDispensaries, or .data.dispensaries
 *   2. window.__APOLLO_STATE__ entries whose key starts with "Dispensary:"
 *
 * @param page - Puppeteer page to inspect
 * @param verbose - Log which source matched, and any extraction error
 * @returns Normalized locations, or an empty array if nothing was found or
 *   the extraction failed
 */
async function extractLocationsFromPage(
  page: Page,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  try {
    // NOTE: this callback executes in the browser, so it must be self-contained
    const embedded = await page.evaluate(() => {
      // Try __NEXT_DATA__
      const rawJson = document.querySelector('#__NEXT_DATA__')?.textContent;
      if (rawJson) {
        try {
          const pageProps = JSON.parse(rawJson)?.props?.pageProps;
          // Look for dispensaries in various paths
          const found =
            pageProps?.dispensaries ||
            pageProps?.initialDispensaries ||
            pageProps?.data?.dispensaries ||
            [];
          if (Array.isArray(found) && found.length > 0) {
            return { source: '__NEXT_DATA__', dispensaries: found };
          }
        } catch {
          // Ignore parse errors
        }
      }

      // Try window variables: extract from the Apollo cache
      const apolloCache = (window as any).__APOLLO_STATE__;
      if (apolloCache) {
        const dispensaryKeys = Object.keys(apolloCache).filter((key) =>
          key.startsWith('Dispensary:')
        );
        if (dispensaryKeys.length > 0) {
          return {
            source: 'APOLLO_STATE',
            dispensaries: dispensaryKeys.map((key) => apolloCache[key]),
          };
        }
      }

      return { source: 'none', dispensaries: [] };
    });

    if (verbose) {
      console.log(`[LocationDiscovery] Page data source: ${embedded.source}, count: ${embedded.dispensaries.length}`);
    }

    return embedded.dispensaries.map((d: any) => normalizeLocationResponse(d));
  } catch (err: any) {
    if (verbose) {
      console.log(`[LocationDiscovery] Could not extract from page data: ${err.message}`);
    }
    return [];
  }
}
|
|
|
|
/**
 * Fetch locations via a GraphQL geo-based query (legacy fallback).
 *
 * Posts the ConsumerDispensaries persisted query with geo filtering:
 *   - dispensaryFilter.nearLat / nearLng for the center point
 *   - dispensaryFilter.distance for the radius
 *   - the response is read from data.filteredDispensaries
 *
 * The center point is chosen from a per-city table (keyed by city slug), then a
 * per-state table, then a fixed default. Because a radius can reach into
 * neighboring states, results are filtered to the city's state when one is
 * set. That comparison, and the state-table lookup, are case-insensitive so
 * they agree with fetchDispensariesByCityState().
 *
 * @param session - Supplies the user agent and cookies for the request
 * @param city - Discovery city (citySlug and stateCode are read)
 * @param verbose - Log status codes, counts and errors
 * @returns Normalized locations; an empty array on a non-200 response or any error
 */
async function fetchLocationsViaGraphQL(
  session: SessionCredentials,
  city: DiscoveryCity,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  // City center coordinates with appropriate radius
  const CITY_COORDS: Record<string, { lat: number; lng: number; radius: number }> = {
    'phoenix': { lat: 33.4484, lng: -112.074, radius: 50 },
    'tucson': { lat: 32.2226, lng: -110.9747, radius: 50 },
    'scottsdale': { lat: 33.4942, lng: -111.9261, radius: 30 },
    'mesa': { lat: 33.4152, lng: -111.8315, radius: 30 },
    'tempe': { lat: 33.4255, lng: -111.94, radius: 30 },
    'flagstaff': { lat: 35.1983, lng: -111.6513, radius: 50 },
  };

  // State-wide coordinates for full coverage
  const STATE_COORDS: Record<string, { lat: number; lng: number; radius: number }> = {
    'AZ': { lat: 33.4484, lng: -112.074, radius: 200 },
    'CA': { lat: 36.7783, lng: -119.4179, radius: 400 },
    'CO': { lat: 39.5501, lng: -105.7821, radius: 200 },
    'FL': { lat: 27.6648, lng: -81.5158, radius: 400 },
    'MI': { lat: 44.3148, lng: -85.6024, radius: 250 },
    'NV': { lat: 36.1699, lng: -115.1398, radius: 200 },
  };

  // Upper-cased state code used for both the table lookup and the result filter
  const stateKey = typeof city.stateCode === 'string' ? city.stateCode.toUpperCase() : '';

  // Try city-specific coords first, then state-wide, then default
  const coords = CITY_COORDS[city.citySlug]
    || (stateKey ? STATE_COORDS[stateKey] : undefined)
    || { lat: 33.4484, lng: -112.074, radius: 200 };

  // GraphQL variables for ConsumerDispensaries
  const variables = {
    dispensaryFilter: {
      activeOnly: true,
      nearLat: coords.lat,
      nearLng: coords.lng,
      distance: coords.radius,
    },
    page: 0,
    perPage: 200,
  };

  // NOTE(review): presumably the same persisted-query hash as
  // GRAPHQL_HASHES.ConsumerDispensaries - confirm, then reuse that constant.
  const hash = '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b';

  try {
    const response = await axios.post(
      'https://dutchie.com/api-3/graphql',
      {
        operationName: 'ConsumerDispensaries',
        variables,
        extensions: {
          persistedQuery: { version: 1, sha256Hash: hash },
        },
      },
      {
        headers: {
          'content-type': 'application/json',
          'origin': 'https://dutchie.com',
          'referer': `https://dutchie.com/dispensaries/${city.stateCode?.toLowerCase()}/${city.citySlug}`,
          'user-agent': session.userAgent,
          'cookie': session.cookies,
        },
        timeout: 30000,
        // Never throw on HTTP status; the status is checked explicitly below
        validateStatus: () => true,
      }
    );

    if (response.status !== 200) {
      if (verbose) {
        console.log(`[LocationDiscovery] GraphQL returned ${response.status}`);
      }
      return [];
    }

    // Response is at data.filteredDispensaries; anything but an array counts as empty
    const payload = response.data?.data?.filteredDispensaries;
    const dispensaries: any[] = Array.isArray(payload) ? payload : [];

    // Filter to the specific state if needed (radius may include neighboring states)
    const filtered = stateKey
      ? dispensaries.filter((d: any) => {
          const state = d.location?.state;
          return typeof state === 'string' && state.toUpperCase() === stateKey;
        })
      : dispensaries;

    if (verbose) {
      console.log(`[LocationDiscovery] GraphQL returned ${dispensaries.length} total, ${filtered.length} in ${city.stateCode || 'all states'}`);
    }

    return filtered.map((d: any) => normalizeLocationResponse(d));
  } catch (error: unknown) {
    if (verbose) {
      const message = error instanceof Error ? error.message : String(error);
      console.log(`[LocationDiscovery] GraphQL error: ${message}`);
    }
    return [];
  }
}
|
|
|
|
/**
 * Scrape location cards from the visible page.
 *
 * Tries a list of card selectors in order and returns the cards produced by
 * the first selector that yields at least one usable card (one with both a
 * slug and a name). A selector that matches elements but produces no usable
 * card no longer ends the search, and each slug is reported only once.
 *
 * The slug is taken from the card's link (`/dispensary/<slug>`). Scraped
 * records carry no platform ID, so `id` is always the empty string.
 *
 * @param page - Puppeteer page to scrape
 * @param verbose - Log scraping errors
 * @returns Scraped locations, or an empty array if none were found or scraping failed
 */
async function scrapeLocationCards(
  page: Page,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  try {
    // NOTE: this callback executes in the browser, so it must be self-contained
    const locations = await page.evaluate(() => {
      // Common dispensary card patterns, most specific first
      const selectors = [
        '[data-testid="dispensary-card"]',
        '.dispensary-card',
        'a[href*="/dispensary/"]',
        '[class*="DispensaryCard"]',
      ];

      for (const selector of selectors) {
        const cards: any[] = [];
        const seenSlugs = new Set<string>();

        document.querySelectorAll(selector).forEach((el) => {
          // The element may wrap a link or be the link itself
          const link = el.querySelector('a')?.href || (el as HTMLAnchorElement).href || '';
          const name = el.querySelector('h2, h3, [class*="name"]')?.textContent?.trim() || '';
          const address = el.querySelector('[class*="address"], address')?.textContent?.trim() || '';

          // Extract slug from URL
          const slugMatch = link.match(/\/dispensary\/([^/?]+)/);
          const slug = slugMatch ? slugMatch[1] : '';

          if (slug && name && !seenSlugs.has(slug)) {
            seenSlugs.add(slug);
            cards.push({
              slug,
              name,
              address,
              menuUrl: link,
            });
          }
        });

        // Stop after the first selector that actually produced cards
        if (cards.length > 0) {
          return cards;
        }
      }

      return [];
    });

    return locations.map((d: any) => ({
      id: '',
      name: d.name,
      slug: d.slug,
      address: d.address,
      menuUrl: d.menuUrl,
    }));
  } catch (error: unknown) {
    if (verbose) {
      const message = error instanceof Error ? error.message : String(error);
      console.log(`[LocationDiscovery] Scraping error: ${message}`);
    }
    return [];
  }
}
|
|
|
|
/**
 * Normalize a raw location record to a consistent format.
 *
 * Resolves each output field from the first non-empty candidate among the
 * field names seen in the different data sources (top-level fields, the nested
 * `.location` object, `storeSettings`, `retailer`), falling back to a default.
 *
 * All raw fields are preserved on the result, but normalized fields take
 * precedence. The raw record is spread FIRST so that a null or empty raw value
 * (for example `id: null` alongside a valid `_id`) cannot overwrite the value
 * resolved here.
 *
 * @param raw - Raw record from GraphQL, embedded page data or the Apollo
 *   cache; null/undefined is treated as an empty record
 * @returns The normalized location
 */
function normalizeLocationResponse(raw: any): DutchieLocationResponse {
  const source = raw ?? {};

  const slug = source.slug || source.cName || source.urlSlug || '';
  const id = source.id || source._id || source.dispensaryId || '';

  // Extract location data - GraphQL response nests address info in .location
  const loc = source.location || {};

  // Extract coordinates from geometry.coordinates [longitude, latitude]
  const coords = loc.geometry?.coordinates || [];
  const longitude = coords[0] || source.longitude || source.lng || loc.longitude || loc.lng;
  const latitude = coords[1] || source.latitude || source.lat || loc.latitude || loc.lat;

  return {
    // Preserve raw data; every normalized field below overrides its raw counterpart
    ...source,
    id,
    name: source.name || source.dispensaryName || '',
    slug,
    cName: source.cName || source.slug || '',
    address: source.address || source.fullAddress || loc.ln1 || '',
    address1: source.address1 || source.addressLine1 || source.streetAddress || loc.ln1 || '',
    address2: source.address2 || source.addressLine2 || loc.ln2 || '',
    city: source.city || loc.city || '',
    state: source.state || source.stateCode || loc.state || '',
    zip: source.zip || source.zipCode || source.postalCode || loc.zipcode || loc.zip || '',
    country: source.country || source.countryCode || loc.country || 'United States',
    latitude,
    longitude,
    timezone: source.timezone || source.tz || '',
    menuUrl: source.menuUrl || (slug ? `https://dutchie.com/dispensary/${slug}` : ''),
    retailType: source.retailType || source.type || '',
    // Service offerings
    offerPickup: source.offerPickup ?? source.storeSettings?.offerPickup ?? true,
    offerDelivery: source.offerDelivery ?? source.storeSettings?.offerDelivery ?? false,
    offerCurbsidePickup: source.offerCurbsidePickup ?? false,
    // License types
    isRecreational: source.isRecreational ?? source.recDispensary ?? source.retailType?.includes('Recreational') ?? true,
    isMedical: source.isMedical ?? source.medicalDispensary ?? source.retailType?.includes('Medical') ?? true,
    // Contact info
    phone: source.phone || '',
    email: source.email || '',
    website: source.embedBackUrl || '',
    // Branding
    description: source.description || '',
    logoImage: source.logoImage || '',
    bannerImage: source.bannerImage || '',
    // Chain/enterprise info
    chainSlug: source.chain || '',
    enterpriseId: source.retailer?.enterpriseId || '',
    // Status
    status: source.status || '',
  };
}
|
|
|
|
// ============================================================
// DATABASE OPERATIONS
// ============================================================
|
|
|
|
/**
 * Insert a location into dutchie_discovery_locations, or refresh the existing
 * row with the same (platform, platform_location_id).
 *
 * A platform ID (`location.id`) is REQUIRED: records without one are skipped
 * with a warning, and the slug is never used as a substitute.
 *
 * On conflict, name, menu URL and metadata are always overwritten; most other
 * columns are only overwritten when the incoming value is non-null (COALESCE),
 * and last_seen_at / updated_at are set to NOW().
 *
 * @param pool - PostgreSQL connection pool
 * @param location - Normalized location to store
 * @param cityId - discovery_city_id to record on insert, or null
 * @returns The row id and whether the row was newly inserted, or null if the
 *   location was skipped for lack of a platform ID
 */
export async function upsertLocation(
  pool: Pool,
  location: DutchieLocationResponse,
  cityId: number | null
): Promise<{ id: number; isNew: boolean } | null> {
  // REQUIRE actual platform ID - NO fallback to slug
  const platformLocationId = location.id;
  if (!platformLocationId) {
    console.warn(`[LocationDiscovery] Skipping location without platform ID: ${location.name} (${location.slug})`);
    return null;
  }

  const menuUrl = location.menuUrl || `https://dutchie.com/dispensary/${location.slug}`;

  // `xmax = 0` holds for a freshly inserted row, which distinguishes insert from update
  const upsertSql = `INSERT INTO dutchie_discovery_locations (
platform,
platform_location_id,
platform_slug,
platform_menu_url,
name,
raw_address,
address_line1,
address_line2,
city,
state_code,
postal_code,
country_code,
latitude,
longitude,
timezone,
discovery_city_id,
metadata,
offers_delivery,
offers_pickup,
is_recreational,
is_medical,
phone,
website,
email,
description,
logo_image,
banner_image,
chain_slug,
enterprise_id,
c_name,
country,
store_status,
last_seen_at,
updated_at
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31, $32, NOW(), NOW())
ON CONFLICT (platform, platform_location_id)
DO UPDATE SET
name = EXCLUDED.name,
platform_menu_url = EXCLUDED.platform_menu_url,
raw_address = COALESCE(EXCLUDED.raw_address, dutchie_discovery_locations.raw_address),
address_line1 = COALESCE(EXCLUDED.address_line1, dutchie_discovery_locations.address_line1),
address_line2 = COALESCE(EXCLUDED.address_line2, dutchie_discovery_locations.address_line2),
city = COALESCE(EXCLUDED.city, dutchie_discovery_locations.city),
state_code = COALESCE(EXCLUDED.state_code, dutchie_discovery_locations.state_code),
postal_code = COALESCE(EXCLUDED.postal_code, dutchie_discovery_locations.postal_code),
latitude = COALESCE(EXCLUDED.latitude, dutchie_discovery_locations.latitude),
longitude = COALESCE(EXCLUDED.longitude, dutchie_discovery_locations.longitude),
timezone = COALESCE(EXCLUDED.timezone, dutchie_discovery_locations.timezone),
metadata = EXCLUDED.metadata,
offers_delivery = COALESCE(EXCLUDED.offers_delivery, dutchie_discovery_locations.offers_delivery),
offers_pickup = COALESCE(EXCLUDED.offers_pickup, dutchie_discovery_locations.offers_pickup),
is_recreational = COALESCE(EXCLUDED.is_recreational, dutchie_discovery_locations.is_recreational),
is_medical = COALESCE(EXCLUDED.is_medical, dutchie_discovery_locations.is_medical),
phone = COALESCE(EXCLUDED.phone, dutchie_discovery_locations.phone),
website = COALESCE(EXCLUDED.website, dutchie_discovery_locations.website),
email = COALESCE(EXCLUDED.email, dutchie_discovery_locations.email),
description = COALESCE(EXCLUDED.description, dutchie_discovery_locations.description),
logo_image = COALESCE(EXCLUDED.logo_image, dutchie_discovery_locations.logo_image),
banner_image = COALESCE(EXCLUDED.banner_image, dutchie_discovery_locations.banner_image),
chain_slug = COALESCE(EXCLUDED.chain_slug, dutchie_discovery_locations.chain_slug),
enterprise_id = COALESCE(EXCLUDED.enterprise_id, dutchie_discovery_locations.enterprise_id),
c_name = COALESCE(EXCLUDED.c_name, dutchie_discovery_locations.c_name),
country = COALESCE(EXCLUDED.country, dutchie_discovery_locations.country),
store_status = COALESCE(EXCLUDED.store_status, dutchie_discovery_locations.store_status),
last_seen_at = NOW(),
updated_at = NOW()
RETURNING id, (xmax = 0) as is_new`;

  // Positional values for $1..$32, in column order. Empty strings become NULL
  // so the COALESCE clauses above keep the stored value.
  const values = [
    PLATFORM,                                // $1  platform
    platformLocationId,                      // $2  platform_location_id
    location.slug,                           // $3  platform_slug
    menuUrl,                                 // $4  platform_menu_url
    location.name,                           // $5  name
    location.address || null,                // $6  raw_address
    location.address1 || null,               // $7  address_line1
    location.address2 || null,               // $8  address_line2
    location.city || null,                   // $9  city
    location.state || null,                  // $10 state_code
    location.zip || null,                    // $11 postal_code
    // NOTE(review): country_code receives the same value as `country` ($31),
    // i.e. a country name by default - confirm this column is not meant to
    // hold a code such as "US".
    location.country || 'United States',     // $12 country_code
    location.latitude || null,               // $13 latitude
    location.longitude || null,              // $14 longitude
    location.timezone || null,               // $15 timezone
    cityId,                                  // $16 discovery_city_id
    JSON.stringify(location),                // $17 metadata
    location.offerDelivery ?? null,          // $18 offers_delivery
    location.offerPickup ?? null,            // $19 offers_pickup
    location.isRecreational ?? null,         // $20 is_recreational
    location.isMedical ?? null,              // $21 is_medical
    location.phone || null,                  // $22 phone
    location.website || null,                // $23 website
    location.email || null,                  // $24 email
    location.description || null,            // $25 description
    location.logoImage || null,              // $26 logo_image
    location.bannerImage || null,            // $27 banner_image
    location.chainSlug || null,              // $28 chain_slug
    location.enterpriseId || null,           // $29 enterprise_id
    location.cName || null,                  // $30 c_name
    location.country || 'United States',     // $31 country
    location.status || null,                 // $32 store_status
  ];

  const result = await pool.query(upsertSql, values);
  const [row] = result.rows;

  return {
    id: row.id,
    isNew: row.is_new,
  };
}
|
|
|
|
/**
 * List active discovery locations that have a given status.
 *
 * Only rows with `active = TRUE` are returned, newest `first_seen_at` first.
 * The optional state and country filters are added only when provided, and
 * every value is passed as a bound parameter.
 *
 * @param pool - PostgreSQL connection pool
 * @param status - Discovery status to match
 * @param options - stateCode / countryCode: optional equality filters;
 *   limit (default 100) and offset (default 0) for paging
 * @returns The matching locations mapped with mapLocationRowToLocation
 */
export async function getLocationsByStatus(
  pool: Pool,
  status: DiscoveryStatus,
  options: {
    stateCode?: string;
    countryCode?: string;
    limit?: number;
    offset?: number;
  } = {}
): Promise<DiscoveryLocation[]> {
  const { stateCode, countryCode, limit = 100, offset = 0 } = options;

  const params: any[] = [status];

  // Appends a value and returns its placeholder ($2, $3, ...), keeping the
  // placeholder numbering in step with the params array.
  const bind = (value: unknown): string => {
    params.push(value);
    return `$${params.length}`;
  };

  let query = `
SELECT * FROM dutchie_discovery_locations
WHERE status = $1 AND active = TRUE
`;

  if (stateCode) {
    query += ` AND state_code = ${bind(stateCode)}`;
  }

  if (countryCode) {
    query += ` AND country_code = ${bind(countryCode)}`;
  }

  query += ` ORDER BY first_seen_at DESC LIMIT ${bind(limit)} OFFSET ${bind(offset)}`;

  const { rows } = await pool.query<DiscoveryLocationRow>(query, params);
  return rows.map(mapLocationRowToLocation);
}
|
|
|
|
/**
 * Get a location by ID.
 *
 * @param pool - Postgres connection pool used to run the query.
 * @param id - Value of the `id` column in `dutchie_discovery_locations`.
 * @returns The mapped location, or `null` when no row has that id.
 */
export async function getLocationById(
  pool: Pool,
  id: number
): Promise<DiscoveryLocation | null> {
  const { rows } = await pool.query<DiscoveryLocationRow>(
    `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
    [id]
  );

  return rows.length === 0 ? null : mapLocationRowToLocation(rows[0]);
}
|
|
|
|
/**
 * Update location status.
 *
 * Sets `status` on one `dutchie_discovery_locations` row and refreshes
 * `updated_at`. `verified_at` is stamped with the current time only when the
 * new status is 'verified' or 'merged'; otherwise it is left as stored.
 *
 * @param pool - Postgres connection pool used to run the update.
 * @param locationId - `id` of the row to update.
 * @param status - New status value.
 * @param options - Optional `dispensaryId`, `verifiedBy` and `notes`. A
 *   missing or falsy value leaves the corresponding column unchanged.
 */
export async function updateLocationStatus(
  pool: Pool,
  locationId: number,
  status: DiscoveryStatus,
  options: {
    dispensaryId?: number;
    verifiedBy?: string;
    notes?: string;
  } = {}
): Promise<void> {
  const sql = `UPDATE dutchie_discovery_locations
     SET status = $2,
         dispensary_id = COALESCE($3, dispensary_id),
         verified_at = CASE WHEN $2 IN ('verified', 'merged') THEN NOW() ELSE verified_at END,
         verified_by = COALESCE($4, verified_by),
         notes = COALESCE($5, notes),
         updated_at = NOW()
     WHERE id = $1`;

  // Falsy option values are sent as NULL so COALESCE keeps the stored value.
  const values = [
    locationId,
    status,
    options.dispensaryId || null,
    options.verifiedBy || null,
    options.notes || null,
  ];

  await pool.query(sql, values);
}
|
|
|
|
/**
 * Search locations by name or address.
 *
 * Runs a case-insensitive substring match (ILIKE) of `query` against the
 * `name`, `city`, `raw_address` and `platform_slug` columns of active rows,
 * ordered by name.
 *
 * The search text is matched literally: `%`, `_` and `\` inside `query` are
 * escaped so they are not interpreted as LIKE wildcards / escape characters.
 *
 * @param pool - Postgres connection pool used to run the query.
 * @param query - Text to look for.
 * @param options - Optional `status` and `stateCode` filters (falsy values
 *   are ignored) and `limit` (default 50).
 * @returns The matching rows, each passed through `mapLocationRowToLocation`.
 */
export async function searchLocations(
  pool: Pool,
  query: string,
  options: {
    status?: DiscoveryStatus;
    stateCode?: string;
    limit?: number;
  } = {}
): Promise<DiscoveryLocation[]> {
  const { status, stateCode, limit = 50 } = options;

  // Backslash is PostgreSQL's default LIKE/ILIKE escape character. Without
  // this, a search for "100%" or "a_b" would treat % and _ as wildcards.
  const escapedQuery = query.replace(/[\\%_]/g, '\\$&');
  const searchPattern = `%${escapedQuery}%`;

  const params: unknown[] = [searchPattern];
  // Registers a bind value and returns its positional placeholder ($2, $3, ...).
  const bind = (value: unknown): string => {
    params.push(value);
    return `$${params.length}`;
  };

  let sql = `
    SELECT * FROM dutchie_discovery_locations
    WHERE active = TRUE
      AND (name ILIKE $1 OR city ILIKE $1 OR raw_address ILIKE $1 OR platform_slug ILIKE $1)
  `;

  if (status) {
    sql += ` AND status = ${bind(status)}`;
  }

  if (stateCode) {
    sql += ` AND state_code = ${bind(stateCode)}`;
  }

  // The id tiebreaker makes the LIMIT-ed subset deterministic when several
  // rows share the same name.
  sql += ` ORDER BY name, id LIMIT ${bind(limit)}`;

  const result = await pool.query<DiscoveryLocationRow>(sql, params);
  // Arrow wrapper so Array.prototype.map's index/array arguments are not forwarded.
  return result.rows.map((row) => mapLocationRowToLocation(row));
}
|
|
|
|
// ============================================================
|
|
// MAIN DISCOVERY FUNCTION
|
|
// ============================================================
|
|
|
|
/**
 * Discover locations for a specific city.
 *
 * Fetches the city's location listing via `fetchLocationsForCity` and upserts
 * each entry with `upsertLocation`. A failure on one location is recorded in
 * the result's `errors` and does not stop the remaining locations. In live
 * mode the city's row in `dutchie_discovery_cities` is then stamped with the
 * crawl time and the number of locations found. When the listing is empty the
 * function returns early without touching the city row.
 *
 * @param pool - Postgres connection pool.
 * @param city - City to discover; its `id`, `citySlug`, `cityName` and
 *   `stateCode` are read here.
 * @param options - `dryRun` (default false) skips the upserts and the city
 *   status update and counts every fetched location as new; `verbose`
 *   (default false) logs each location and is forwarded to the fetch.
 * @returns Counts of found / upserted / new / updated locations, the
 *   per-location error messages, and the elapsed time in milliseconds.
 */
export async function discoverLocationsForCity(
  pool: Pool,
  city: DiscoveryCity,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult> {
  const startTime = Date.now();
  const { dryRun = false, verbose = false } = options;
  const errors: string[] = [];

  // Thrown values are not guaranteed to be Error instances (or even objects);
  // reading `.message` blindly would log "undefined" or throw on null.
  const describeError = (thrown: unknown): string => {
    if (thrown instanceof Error) {
      return thrown.message;
    }
    if (typeof thrown === 'object' && thrown !== null && 'message' in thrown) {
      return String((thrown as { message: unknown }).message);
    }
    return String(thrown);
  };

  console.log(`[LocationDiscovery] Discovering locations for ${city.cityName}, ${city.stateCode}...`);
  console.log(`[LocationDiscovery] Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);

  const locations = await fetchLocationsForCity(city, { verbose });

  if (locations.length === 0) {
    console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
    return {
      cityId: city.id,
      citySlug: city.citySlug,
      locationsFound: 0,
      locationsUpserted: 0,
      locationsNew: 0,
      locationsUpdated: 0,
      errors: [],
      durationMs: Date.now() - startTime,
    };
  }

  let newCount = 0;
  let updatedCount = 0;

  for (const location of locations) {
    try {
      if (dryRun) {
        if (verbose) {
          console.log(`[LocationDiscovery][DryRun] Would upsert: ${location.name} (${location.slug})`);
        }
        // Dry runs cannot tell new from existing rows, so everything counts as new.
        newCount++;
        continue;
      }

      const result = await upsertLocation(pool, location, city.id);

      // Skip locations without valid platform ID
      if (!result) {
        errors.push(`Location ${location.slug}: No valid platform ID - skipped`);
        continue;
      }

      if (result.isNew) {
        newCount++;
      } else {
        updatedCount++;
      }

      if (verbose) {
        const action = result.isNew ? 'Created' : 'Updated';
        console.log(`[LocationDiscovery] ${action}: ${location.name} -> ID ${result.id}`);
      }
    } catch (error: unknown) {
      // One bad location must not abort the rest of the city.
      errors.push(`Location ${location.slug}: ${describeError(error)}`);
    }
  }

  // Update city crawl status
  if (!dryRun) {
    await pool.query(
      `UPDATE dutchie_discovery_cities
       SET last_crawled_at = NOW(),
           location_count = $2,
           updated_at = NOW()
       WHERE id = $1`,
      [city.id, locations.length]
    );
  }

  const durationMs = Date.now() - startTime;

  console.log(`[LocationDiscovery] Complete for ${city.cityName}: ${newCount} new, ${updatedCount} updated, ${errors.length} errors in ${durationMs}ms`);

  return {
    cityId: city.id,
    citySlug: city.citySlug,
    locationsFound: locations.length,
    locationsUpserted: newCount + updatedCount,
    locationsNew: newCount,
    locationsUpdated: updatedCount,
    errors,
    durationMs,
  };
}
|