Files
cannaiq/backend/src/dutchie-az/discovery/DtLocationDiscoveryService.ts
Kelly b4a2fb7d03 feat: Add v2 architecture with multi-state support and orchestrator services
Major additions:
- Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare
- Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator
- Discovery system: dutchie discovery service, geo validation, city seeding scripts
- Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages
- Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram)
- Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata

Frontend pages added:
- Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores
- StateHeatmap, CrossStateCompare, SyncInfoPanel

Components added:
- StateSelector, OrchestratorTraceModal, WorkflowStepper

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-07 11:30:57 -07:00

1250 lines
41 KiB
TypeScript

/**
* DtLocationDiscoveryService
*
* Core service for Dutchie location discovery.
* Contains shared logic used by multiple entrypoints.
*
* Responsibilities:
* - Fetch locations from city pages
* - Extract geo coordinates when available
* - Upsert locations to dutchie_discovery_locations
* - DO NOT overwrite protected statuses or existing lat/lng
*/
import { Pool } from 'pg';
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
puppeteer.use(StealthPlugin());
// ============================================================
// TYPES
// ============================================================
/**
* A city to crawl, as loaded from the dutchie_discovery_cities table
* (snake_case columns mapped to camelCase by the service's city queries).
*/
export interface DiscoveryCity {
// Row id in dutchie_discovery_cities; stored on each location as discovery_city_id.
id: number;
// Platform name; the service's queries filter on 'dutchie'.
platform: string;
// Display name; used as the fallback city for locations that carry no city of their own.
cityName: string;
// Slug interpolated into https://dutchie.com/us/dispensaries/{citySlug}.
citySlug: string;
// State code; fallback state for locations without one. May be null.
stateCode: string | null;
// Country code; fallback country for locations without one.
countryCode: string;
// Value of the crawl_enabled column.
crawlEnabled: boolean;
}
/**
* One dispensary location as produced by discovery, ready to be upserted into
* dutchie_discovery_locations. Fields the source did not provide are null.
*/
export interface DutchieLocation {
// Identifier used to look up an existing row (platform_location_id).
platformLocationId: string;
// Slug taken from the payload or from the /dispensary/{slug} URL.
platformSlug: string;
// Menu URL for the store; also the de-duplication key within one city fetch.
platformMenuUrl: string;
name: string;
// Address as a single string (joined from parts, or scraped text).
rawAddress: string | null;
addressLine1: string | null;
addressLine2: string | null;
city: string | null;
stateCode: string | null;
postalCode: string | null;
countryCode: string | null;
// Coordinates; null when the source payload had none (always null for DOM-scraped entries).
latitude: number | null;
longitude: number | null;
timezone: string | null;
offersDelivery: boolean | null;
offersPickup: boolean | null;
isRecreational: boolean | null;
isMedical: boolean | null;
// Free-form extras (e.g. `source`); serialized with JSON.stringify before storage.
metadata: Record<string, any>;
}
/**
* Outcome of discovering locations for a single city.
*/
export interface LocationDiscoveryResult {
cityId: number;
citySlug: string;
// Number of locations returned by the fetch step (after filtering/de-duplication).
locationsFound: number;
// Rows newly inserted.
locationsInserted: number;
// Existing rows fully updated.
locationsUpdated: number;
// Existing rows left alone because their status is protected.
locationsSkipped: number;
// Store count parsed from the city page text; null when none was found.
reportedStoreCount: number | null;
// Human-readable error messages collected during the run.
errors: string[];
// Wall-clock duration of the run in milliseconds.
durationMs: number;
}
/**
* Internal result of fetching one city page: the parsed locations plus the
* store count the page itself displayed (null when no count text matched).
*/
interface FetchResult {
locations: DutchieLocation[];
reportedStoreCount: number | null;
}
/**
* Aggregated outcome of running discovery across several cities.
*/
export interface BatchDiscoveryResult {
// Number of cities processed in the batch.
totalCities: number;
// Sums of the per-city counters.
totalLocationsFound: number;
totalInserted: number;
totalUpdated: number;
totalSkipped: number;
// All per-city error messages, concatenated.
errors: string[];
// Wall-clock duration of the whole batch in milliseconds.
durationMs: number;
}
// ============================================================
// COORDINATE EXTRACTION HELPERS
// ============================================================
/**
 * Extract a latitude from a dispensary payload, trying each known shape in order.
 *
 * Lookup order (the first finite number wins):
 *   1. `data.lat` / `data.latitude`
 *   2. `data.location.{lat,latitude}`
 *   3. `data.coordinates.{lat,latitude}`
 *   4. `data.coordinates` as a GeoJSON-style `[lng, lat]` array
 *   5. `data.geometry.coordinates` as a `[lng, lat]` array
 *   6. `data.address.{lat,latitude}`
 *   7. `data.geo.{lat,latitude}`
 *
 * Only finite numbers are accepted. Array elements are type-checked too, so a
 * short or non-numeric coordinate array no longer leaks `undefined`/strings out
 * of a function typed `number | null`, and no longer stops the later shapes
 * from being tried.
 *
 * @param data - Raw payload of unknown shape; null/undefined/primitives yield null.
 * @returns The latitude, or null when none of the shapes holds a finite number.
 */
function extractLatitude(data: any): number | null {
  if (data == null || typeof data !== 'object') return null;

  const isCoord = (value: unknown): value is number =>
    typeof value === 'number' && Number.isFinite(value);

  // Reads lat/latitude off a nested object; tolerates strings, null, arrays.
  const fromObject = (obj: any): number | null => {
    if (obj == null || typeof obj !== 'object') return null;
    if (isCoord(obj.lat)) return obj.lat;
    if (isCoord(obj.latitude)) return obj.latitude;
    return null;
  };

  // GeoJSON order is [lng, lat], so latitude is element 1.
  const fromPair = (pair: unknown): number | null =>
    Array.isArray(pair) && pair.length >= 2 && isCoord(pair[1]) ? pair[1] : null;

  return (
    fromObject(data) ??
    fromObject(data.location) ??
    fromObject(data.coordinates) ??
    fromPair(data.coordinates) ??
    fromPair(data.geometry?.coordinates) ??
    fromObject(data.address) ??
    fromObject(data.geo) ??
    null
  );
}
/**
 * Extract a longitude from a dispensary payload, trying each known shape in order.
 *
 * Lookup order (the first finite number wins):
 *   1. `data.lng` / `data.lon` / `data.longitude`
 *   2. `data.location.{lng,lon,longitude}`
 *   3. `data.coordinates.{lng,lon,longitude}`
 *   4. `data.coordinates` as a GeoJSON-style `[lng, lat]` array
 *   5. `data.geometry.coordinates` as a `[lng, lat]` array
 *   6. `data.address.{lng,lon,longitude}`
 *   7. `data.geo.{lng,lon,longitude}`
 *
 * Only finite numbers are accepted. Array elements are type-checked too, so a
 * short or non-numeric coordinate array no longer leaks `undefined`/strings out
 * of a function typed `number | null`, and no longer stops the later shapes
 * from being tried.
 *
 * @param data - Raw payload of unknown shape; null/undefined/primitives yield null.
 * @returns The longitude, or null when none of the shapes holds a finite number.
 */
function extractLongitude(data: any): number | null {
  if (data == null || typeof data !== 'object') return null;

  const isCoord = (value: unknown): value is number =>
    typeof value === 'number' && Number.isFinite(value);

  // Reads lng/lon/longitude off a nested object; tolerates strings, null, arrays.
  const fromObject = (obj: any): number | null => {
    if (obj == null || typeof obj !== 'object') return null;
    if (isCoord(obj.lng)) return obj.lng;
    if (isCoord(obj.lon)) return obj.lon;
    if (isCoord(obj.longitude)) return obj.longitude;
    return null;
  };

  // GeoJSON order is [lng, lat], so longitude is element 0. A full pair is
  // required so latitude and longitude are accepted or rejected together.
  const fromPair = (pair: unknown): number | null =>
    Array.isArray(pair) && pair.length >= 2 && isCoord(pair[0]) ? pair[0] : null;

  return (
    fromObject(data) ??
    fromObject(data.location) ??
    fromObject(data.coordinates) ??
    fromPair(data.coordinates) ??
    fromPair(data.geometry?.coordinates) ??
    fromObject(data.address) ??
    fromObject(data.geo) ??
    null
  );
}
// ============================================================
// LOCATION FETCHING
// ============================================================
/**
 * Convert one raw dispensary record (from the city page's JSON payload) into a
 * DutchieLocation, including coordinate extraction.
 *
 * Field resolution:
 * - slug: `slug`, `cName`, or the name lower-cased with whitespace replaced by `-`.
 * - id: `id`, `_id`, `dispensaryId`, falling back to the slug (the DOM-scrape
 *   path also uses the slug as id). Without this fallback every id-less record
 *   would share the empty-string id and collide on the same database row.
 * - menu URL: `menuUrl`, then `embeddedMenuUrl`, else the dutchie.com URL built
 *   from the slug.
 * - city / state / country: taken from the record, falling back to `city`.
 * - isRecreational / isMedical: explicit flags first, then derived from
 *   `retailType`. When `retailType` is absent the result is null (unknown)
 *   rather than false, so the upsert's COALESCE does not overwrite a stored
 *   value with a guess.
 *
 * @param d - Raw dispensary object of unknown shape.
 * @param city - City being crawled; supplies fallback city/state/country.
 * @returns The normalized location; `metadata.raw` keeps the original record.
 */
function parseDispensaryData(d: any, city: DiscoveryCity): DutchieLocation {
  const slug = d.slug || d.cName || d.name?.toLowerCase().replace(/\s+/g, '-') || '';
  // String() keeps the declared `string` type even when the payload id is numeric.
  const id = String(d.id || d._id || d.dispensaryId || slug);

  // Build menu URL
  let menuUrl = `https://dutchie.com/dispensary/${slug}`;
  if (d.menuUrl) {
    menuUrl = d.menuUrl;
  } else if (d.embeddedMenuUrl) {
    menuUrl = d.embeddedMenuUrl;
  }

  // Parse address
  const address = d.address || d.location?.address || {};
  const addressLine1 = address.line1 || address.street1 || d.address1;
  const addressLine2 = address.line2 || address.street2 || d.address2;
  const cityName = address.city || d.city;
  const state = address.state || address.stateCode || d.state;
  const postalCode = address.zip || address.zipCode || address.postalCode || d.zip;

  // "line1, line2, City ST 12345" with missing parts omitted
  const rawAddress = [
    addressLine1,
    addressLine2,
    [cityName, state, postalCode].filter(Boolean).join(' '),
  ]
    .filter(Boolean)
    .join(', ');

  // Extract coordinates from various possible locations in the payload
  const latitude = extractLatitude(d);
  const longitude = extractLongitude(d);
  if (latitude !== null && longitude !== null) {
    console.log(`[DtLocationDiscoveryService] Extracted coordinates for ${slug}: ${latitude}, ${longitude}`);
  }

  // Only derive rec/med flags from retailType when it is actually present.
  const retailType = d.retailType;
  const retailTypeKnown = typeof retailType === 'string' && retailType !== '';
  const recreationalFromType = retailTypeKnown
    ? retailType === 'recreational' || retailType === 'both'
    : null;
  const medicalFromType = retailTypeKnown
    ? retailType === 'medical' || retailType === 'both'
    : null;

  return {
    platformLocationId: id,
    platformSlug: slug,
    platformMenuUrl: menuUrl,
    name: d.name || d.dispensaryName || '',
    rawAddress: rawAddress || null,
    addressLine1: addressLine1 || null,
    addressLine2: addressLine2 || null,
    city: cityName || city.cityName,
    stateCode: state || city.stateCode,
    postalCode: postalCode || null,
    countryCode: address.country || address.countryCode || d.country || city.countryCode,
    latitude,
    longitude,
    timezone: d.timezone || d.timeZone || null,
    offersDelivery: d.offerDelivery ?? d.offersDelivery ?? d.delivery ?? null,
    offersPickup: d.offerPickup ?? d.offersPickup ?? d.pickup ?? null,
    isRecreational: d.isRecreational ?? d.recreational ?? recreationalFromType,
    isMedical: d.isMedical ?? d.medical ?? medicalFromType,
    metadata: {
      source: 'next_data',
      retailType: d.retailType,
      brand: d.brand,
      logo: d.logo || d.logoUrl,
      raw: d,
    },
  };
}
/**
* Fetch the dispensary locations listed on a city's Dutchie page using Puppeteer.
*
* Flow: launch a headless browser, load
* https://dutchie.com/us/dispensaries/{citySlug}, read the store count shown in
* the page text, extract dispensaries from the __NEXT_DATA__ script (falling
* back to scraping dispensary links from the DOM), then drop ghost/marketing
* entries and de-duplicate by menu URL.
*
* @param city - City whose `citySlug` selects the page; its name/state/country
* are used for fields the DOM fallback cannot provide.
* @returns The filtered locations plus the store count parsed from the page
* (null when no count text matched).
*
* Errors from navigation or page evaluation propagate to the caller; the
* browser is closed in the `finally` block either way.
*/
async function fetchLocationsForCity(city: DiscoveryCity): Promise<FetchResult> {
console.log(`[DtLocationDiscoveryService] Fetching locations for ${city.cityName}, ${city.stateCode}...`);
// A fresh browser is launched per call and closed in `finally`.
const browser = await puppeteer.launch({
headless: 'new',
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
});
try {
const page = await browser.newPage();
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
);
// Use the /us/dispensaries/{city_slug} pattern (NOT /city/{state}/{slug})
const cityUrl = `https://dutchie.com/us/dispensaries/${city.citySlug}`;
console.log(`[DtLocationDiscoveryService] Navigating to ${cityUrl}...`);
await page.goto(cityUrl, {
waitUntil: 'networkidle2',
timeout: 60000,
});
// Fixed 3s pause after network idle — presumably to let client-side rendering settle.
await new Promise((r) => setTimeout(r, 3000));
// Extract reported store count from page header (e.g., "18 dispensaries").
// NOTE: this callback runs inside the page, so it can only use browser globals.
const reportedStoreCount = await page.evaluate(() => {
// Look for patterns like "18 dispensaries", "18 stores", "18 results"
const headerSelectors = [
'h1', 'h2', '[data-testid="city-header"]', '[data-testid="results-count"]',
'.results-header', '.city-header', '.page-header'
];
for (const selector of headerSelectors) {
const elements = Array.from(document.querySelectorAll(selector));
for (const el of elements) {
const text = el.textContent || '';
// Match a number followed by dispensary/dispensaries, store(s), result(s) or location(s)
const match = text.match(/(\d+)\s*(?:dispensar(?:y|ies)|stores?|results?|locations?)/i);
if (match) {
return parseInt(match[1], 10);
}
}
}
// Last resort: first "<number> dispensary/dispensaries" anywhere in the page text
const allText = document.body.innerText;
const globalMatch = allText.match(/(\d+)\s+dispensar(?:y|ies)/i);
if (globalMatch) {
return parseInt(globalMatch[1], 10);
}
return null;
});
if (reportedStoreCount !== null) {
console.log(`[DtLocationDiscoveryService] Dutchie reports ${reportedStoreCount} stores for ${city.citySlug}`);
}
// Try to extract __NEXT_DATA__ (returns null when the script is missing or not valid JSON)
const nextData = await page.evaluate(() => {
const script = document.querySelector('script#__NEXT_DATA__');
if (script) {
try {
return JSON.parse(script.textContent || '{}');
} catch {
return null;
}
}
return null;
});
let locations: DutchieLocation[] = [];
if (nextData?.props?.pageProps?.dispensaries) {
// Preferred path: structured data embedded in the page
const dispensaries = nextData.props.pageProps.dispensaries;
console.log(`[DtLocationDiscoveryService] Found ${dispensaries.length} dispensaries in __NEXT_DATA__`);
locations = dispensaries.map((d: any) => parseDispensaryData(d, city));
} else {
// Fall back to DOM scraping
console.log('[DtLocationDiscoveryService] No __NEXT_DATA__, trying DOM scraping...');
const scrapedData = await page.evaluate(() => {
const stores: Array<{
name: string;
href: string;
address: string | null;
}> = [];
// Matches card containers as well as bare /dispensary/ links, so the same
// store can be collected more than once; duplicates are removed further down.
const cards = document.querySelectorAll('[data-testid="dispensary-card"], .dispensary-card, a[href*="/dispensary/"]');
cards.forEach((card) => {
// Use the link inside the card, or the card itself when it is the anchor
const link = card.querySelector('a[href*="/dispensary/"]') || (card as HTMLAnchorElement);
const href = (link as HTMLAnchorElement).href || '';
const name =
card.querySelector('[data-testid="dispensary-name"]')?.textContent ||
card.querySelector('h2, h3, .name')?.textContent ||
link.textContent ||
'';
const address = card.querySelector('[data-testid="dispensary-address"], .address')?.textContent || null;
if (href && name) {
stores.push({
name: name.trim(),
href,
address: address?.trim() || null,
});
}
});
return stores;
});
console.log(`[DtLocationDiscoveryService] DOM scraping found ${scrapedData.length} raw store cards`);
locations = scrapedData.map((s) => {
// Slug comes from the /dispensary/{slug} URL, else from the name
const match = s.href.match(/\/dispensary\/([^/?]+)/);
const slug = match ? match[1] : s.name.toLowerCase().replace(/\s+/g, '-');
return {
// No platform id is available from the DOM, so the slug doubles as the id
platformLocationId: slug,
platformSlug: slug,
platformMenuUrl: `https://dutchie.com/dispensary/${slug}`,
name: s.name,
rawAddress: s.address,
addressLine1: null,
addressLine2: null,
city: city.cityName,
stateCode: city.stateCode,
postalCode: null,
countryCode: city.countryCode,
latitude: null, // Not available from DOM scraping
longitude: null,
timezone: null,
offersDelivery: null,
offersPickup: null,
isRecreational: null,
isMedical: null,
metadata: { source: 'dom_scrape', originalUrl: s.href },
};
});
}
// =========================================================================
// FILTERING AND DEDUPLICATION
// =========================================================================
const beforeFilterCount = locations.length;
// 1. Filter out ghost entries and marketing links
locations = locations.filter((loc) => {
// Filter out slug matching city slug (e.g., /dispensary/ak-anchorage)
if (loc.platformSlug === city.citySlug) {
console.log(`[DtLocationDiscoveryService] Filtering ghost entry: /dispensary/${loc.platformSlug} (matches city slug)`);
return false;
}
// Filter out marketing/referral links (e.g., try.dutchie.com/dispensary/referral/)
// NOTE(review): parseDispensaryData prefers the payload's menuUrl/embeddedMenuUrl,
// so a __NEXT_DATA__ store whose menu URL is on another host is dropped here too —
// confirm that is intended.
if (!loc.platformMenuUrl.startsWith('https://dutchie.com/dispensary/')) {
console.log(`[DtLocationDiscoveryService] Filtering non-store URL: ${loc.platformMenuUrl}`);
return false;
}
// Filter out generic marketing slugs
const marketingSlugs = ['referral', 'refer-a-dispensary', 'sign-up', 'signup'];
if (marketingSlugs.includes(loc.platformSlug.toLowerCase())) {
console.log(`[DtLocationDiscoveryService] Filtering marketing slug: ${loc.platformSlug}`);
return false;
}
return true;
});
// 2. Deduplicate by platformMenuUrl (unique store URL); the first occurrence wins
const seenUrls = new Set<string>();
locations = locations.filter((loc) => {
if (seenUrls.has(loc.platformMenuUrl)) {
return false;
}
seenUrls.add(loc.platformMenuUrl);
return true;
});
const afterFilterCount = locations.length;
if (beforeFilterCount !== afterFilterCount) {
console.log(`[DtLocationDiscoveryService] Filtered: ${beforeFilterCount} -> ${afterFilterCount} (removed ${beforeFilterCount - afterFilterCount} ghost/duplicate entries)`);
}
// Log comparison for QA (a mismatch is only logged, never treated as an error)
console.log(`[DtLocationDiscoveryService] [${city.citySlug}] reported_store_count=${reportedStoreCount ?? 'N/A'}, scraped_store_count=${afterFilterCount}`);
if (reportedStoreCount !== null && reportedStoreCount !== afterFilterCount) {
console.log(`[DtLocationDiscoveryService] [${city.citySlug}] MISMATCH: Dutchie reports ${reportedStoreCount}, we scraped ${afterFilterCount}`);
}
return { locations, reportedStoreCount };
} finally {
// Always release the browser, including when navigation or evaluation throws
await browser.close();
}
}
// ============================================================
// DATABASE OPERATIONS
// ============================================================
/**
 * Insert or update one discovered location in dutchie_discovery_locations.
 *
 * The row is looked up by platform 'dutchie' + platform_location_id, then:
 * - no row: insert it with status 'discovered' -> `{ inserted: true }`
 * - row with status verified/merged/rejected: only touch last_seen_at /
 *   updated_at and fill latitude/longitude when they are NULL -> `{ skipped: true }`
 * - any other row: refresh its descriptive columns -> `{ updated: true }`
 *
 * Existing latitude/longitude values are never replaced (only NULLs are
 * filled), and this function never writes status or dispensary_id on an
 * existing row.
 *
 * @param pool - pg pool used for all statements.
 * @param location - Location to persist.
 * @param cityId - Id of the discovery city, stored as discovery_city_id.
 * @returns Exactly one of inserted/updated/skipped set to true.
 */
async function upsertLocation(
  pool: Pool,
  location: DutchieLocation,
  cityId: number
): Promise<{ inserted: boolean; updated: boolean; skipped: boolean }> {
  const outcome = (kind: 'inserted' | 'updated' | 'skipped') => ({
    inserted: kind === 'inserted',
    updated: kind === 'updated',
    skipped: kind === 'skipped',
  });

  // Values for placeholders $2..$20, shared by the full UPDATE and the INSERT.
  // Built lazily so metadata is only serialized on the paths that store it.
  const sharedValues = () => [
    location.platformSlug,
    location.platformMenuUrl,
    location.name,
    location.rawAddress,
    location.addressLine1,
    location.addressLine2,
    location.city,
    location.stateCode,
    location.postalCode,
    location.countryCode,
    location.latitude,
    location.longitude,
    location.timezone,
    location.offersDelivery,
    location.offersPickup,
    location.isRecreational,
    location.isMedical,
    JSON.stringify(location.metadata),
    cityId,
  ];

  const lookup = await pool.query(
    `
SELECT id, status, dispensary_id, latitude, longitude
FROM dutchie_discovery_locations
WHERE platform = 'dutchie' AND platform_location_id = $1
`,
    [location.platformLocationId]
  );

  // --- New location -------------------------------------------------------
  if (lookup.rows.length === 0) {
    await pool.query(
      `
INSERT INTO dutchie_discovery_locations (
platform,
platform_location_id,
platform_slug,
platform_menu_url,
name,
raw_address,
address_line1,
address_line2,
city,
state_code,
postal_code,
country_code,
latitude,
longitude,
timezone,
status,
offers_delivery,
offers_pickup,
is_recreational,
is_medical,
metadata,
discovery_city_id,
first_seen_at,
last_seen_at,
active,
created_at,
updated_at
) VALUES (
'dutchie',
$1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14,
'discovered',
$15, $16, $17, $18, $19, $20,
NOW(), NOW(), TRUE, NOW(), NOW()
)
`,
      [location.platformLocationId, ...sharedValues()]
    );
    return outcome('inserted');
  }

  const current = lookup.rows[0];
  const isProtected = new Set(['verified', 'merged', 'rejected']).has(current.status);

  // --- Protected row: heartbeat plus coordinate back-fill only ------------
  if (isProtected) {
    await pool.query(
      `
UPDATE dutchie_discovery_locations
SET
last_seen_at = NOW(),
updated_at = NOW(),
latitude = CASE WHEN latitude IS NULL THEN $2 ELSE latitude END,
longitude = CASE WHEN longitude IS NULL THEN $3 ELSE longitude END
WHERE id = $1
`,
      [current.id, location.latitude, location.longitude]
    );
    return outcome('skipped');
  }

  // --- Ordinary existing row: refresh details, keep stored coordinates ----
  await pool.query(
    `
UPDATE dutchie_discovery_locations
SET
platform_slug = $2,
platform_menu_url = $3,
name = $4,
raw_address = COALESCE($5, raw_address),
address_line1 = COALESCE($6, address_line1),
address_line2 = COALESCE($7, address_line2),
city = COALESCE($8, city),
state_code = COALESCE($9, state_code),
postal_code = COALESCE($10, postal_code),
country_code = COALESCE($11, country_code),
latitude = CASE WHEN latitude IS NULL THEN $12 ELSE latitude END,
longitude = CASE WHEN longitude IS NULL THEN $13 ELSE longitude END,
timezone = COALESCE($14, timezone),
offers_delivery = COALESCE($15, offers_delivery),
offers_pickup = COALESCE($16, offers_pickup),
is_recreational = COALESCE($17, is_recreational),
is_medical = COALESCE($18, is_medical),
metadata = COALESCE($19, metadata),
discovery_city_id = $20,
last_seen_at = NOW(),
updated_at = NOW()
WHERE id = $1
`,
    [current.id, ...sharedValues()]
  );
  return outcome('updated');
}
// ============================================================
// MAIN SERVICE CLASS
// ============================================================
export class DtLocationDiscoveryService {
/** Keeps the pg pool that this service's methods run their queries through. */
constructor(private pool: Pool) {}
/**
 * Look up a single Dutchie discovery city by its slug.
 *
 * @param citySlug - Value matched against the city_slug column.
 * @returns The city mapped to camelCase, or null when no row matches.
 */
async getCityBySlug(citySlug: string): Promise<DiscoveryCity | null> {
  const result = await this.pool.query(
    `
SELECT id, platform, city_name, city_slug, state_code, country_code, crawl_enabled
FROM dutchie_discovery_cities
WHERE platform = 'dutchie' AND city_slug = $1
LIMIT 1
`,
    [citySlug]
  );

  if (result.rows.length > 0) {
    const record = result.rows[0];
    return {
      id: record.id,
      platform: record.platform,
      cityName: record.city_name,
      citySlug: record.city_slug,
      stateCode: record.state_code,
      countryCode: record.country_code,
      crawlEnabled: record.crawl_enabled,
    };
  }
  return null;
}
/**
 * Get all crawl-enabled Dutchie cities, least recently crawled first
 * (never-crawled cities lead, ties broken by city name).
 *
 * @param limit - Optional maximum number of cities. Omitted/0 means no limit.
 *   The value is validated and sent as a bound parameter instead of being
 *   spliced into the SQL text, so a non-numeric value arriving at runtime
 *   (e.g. from a request query string) cannot alter the statement.
 * @returns The matching cities mapped to camelCase.
 * @throws RangeError when `limit` is provided but is not a non-negative integer.
 */
async getEnabledCities(limit?: number): Promise<DiscoveryCity[]> {
  const params: number[] = [];
  let limitClause = '';
  if (limit) {
    const rowLimit = Number(limit);
    if (!Number.isInteger(rowLimit) || rowLimit < 0) {
      throw new RangeError(
        `getEnabledCities: limit must be a non-negative integer, got ${String(limit)}`
      );
    }
    params.push(rowLimit);
    limitClause = 'LIMIT $1';
  }

  const { rows } = await this.pool.query(
    `
SELECT id, platform, city_name, city_slug, state_code, country_code, crawl_enabled
FROM dutchie_discovery_cities
WHERE platform = 'dutchie' AND crawl_enabled = TRUE
ORDER BY last_crawled_at ASC NULLS FIRST, city_name ASC
${limitClause}
`,
    params
  );

  return rows.map((r) => ({
    id: r.id,
    platform: r.platform,
    cityName: r.city_name,
    citySlug: r.city_slug,
    stateCode: r.state_code,
    countryCode: r.country_code,
    crawlEnabled: r.crawl_enabled,
  }));
}
/**
 * Run discovery for one city: fetch its locations, upsert each of them, then
 * record the crawl on the city row.
 *
 * A failure to upsert one location is logged, added to `errors`, and does not
 * stop the remaining locations. A failure of the fetch (or of the city-row
 * update) is also captured in `errors`; this method does not throw for them.
 *
 * @param city - City to discover.
 * @returns Counters, the page-reported store count, collected errors and duration.
 */
async discoverForCity(city: DiscoveryCity): Promise<LocationDiscoveryResult> {
  const startedAt = Date.now();
  const errors: string[] = [];
  const tally = { found: 0, inserted: 0, updated: 0, skipped: 0 };
  let reportedStoreCount: number | null = null;

  console.log(`[DtLocationDiscoveryService] Discovering locations for ${city.cityName}, ${city.stateCode}...`);

  try {
    const fetched = await fetchLocationsForCity(city);
    reportedStoreCount = fetched.reportedStoreCount;
    tally.found = fetched.locations.length;
    console.log(`[DtLocationDiscoveryService] Found ${tally.found} locations`);

    // Count how many have coordinates
    let geocoded = 0;
    for (const candidate of fetched.locations) {
      if (candidate.latitude !== null && candidate.longitude !== null) geocoded += 1;
    }
    if (geocoded > 0) {
      console.log(`[DtLocationDiscoveryService] ${geocoded}/${tally.found} locations have coordinates`);
    }

    // Upserts run one at a time; each failure is isolated to its own location
    for (const candidate of fetched.locations) {
      try {
        const outcome = await upsertLocation(this.pool, candidate, city.id);
        if (outcome.inserted) {
          tally.inserted += 1;
        } else if (outcome.updated) {
          tally.updated += 1;
        } else if (outcome.skipped) {
          tally.skipped += 1;
        }
      } catch (error: any) {
        const msg = `Failed to upsert location ${candidate.platformSlug}: ${error.message}`;
        console.error(`[DtLocationDiscoveryService] ${msg}`);
        errors.push(msg);
      }
    }

    // Stamp the crawl time and store the reported/scraped counts on the city row
    await this.pool.query(
      `
UPDATE dutchie_discovery_cities
SET last_crawled_at = NOW(),
location_count = $1,
metadata = COALESCE(metadata, '{}')::jsonb || jsonb_build_object(
'reported_store_count', $3::int,
'scraped_store_count', $1::int,
'last_discovery_at', NOW()::text
),
updated_at = NOW()
WHERE id = $2
`,
      [tally.found, city.id, reportedStoreCount]
    );
  } catch (error: any) {
    const msg = `Location discovery failed for ${city.citySlug}: ${error.message}`;
    console.error(`[DtLocationDiscoveryService] ${msg}`);
    errors.push(msg);
  }

  const durationMs = Date.now() - startedAt;

  const summaryLines = [
    `[DtLocationDiscoveryService] City ${city.citySlug} complete:`,
    ` Reported count: ${reportedStoreCount ?? 'N/A'}`,
    ` Locations found: ${tally.found}`,
    ` Inserted: ${tally.inserted}`,
    ` Updated: ${tally.updated}`,
    ` Skipped (protected): ${tally.skipped}`,
    ` Errors: ${errors.length}`,
    ` Duration: ${(durationMs / 1000).toFixed(1)}s`,
  ];
  for (const line of summaryLines) {
    console.log(line);
  }

  return {
    cityId: city.id,
    citySlug: city.citySlug,
    locationsFound: tally.found,
    locationsInserted: tally.inserted,
    locationsUpdated: tally.updated,
    locationsSkipped: tally.skipped,
    reportedStoreCount,
    errors,
    durationMs,
  };
}
/**
 * Run discovery for every crawl-enabled city, one city at a time.
 *
 * @param options.limit - Optional cap on the number of cities processed.
 * @param options.delayMs - Pause between consecutive cities in milliseconds
 *   (default 2000); no pause follows the last city, and 0 disables it.
 * @returns Totals summed over all cities plus every collected error message.
 */
async discoverAllEnabled(options: {
  limit?: number;
  delayMs?: number;
} = {}): Promise<BatchDiscoveryResult> {
  const { limit, delayMs = 2000 } = options;
  const startedAt = Date.now();
  const totals = { found: 0, inserted: 0, updated: 0, skipped: 0 };
  const allErrors: string[] = [];

  const cities = await this.getEnabledCities(limit);
  console.log(`[DtLocationDiscoveryService] Discovering locations for ${cities.length} cities...`);

  const lastIndex = cities.length - 1;
  for (const [index, city] of cities.entries()) {
    console.log(`\n[DtLocationDiscoveryService] City ${index + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);

    try {
      const cityResult = await this.discoverForCity(city);
      totals.found += cityResult.locationsFound;
      totals.inserted += cityResult.locationsInserted;
      totals.updated += cityResult.locationsUpdated;
      totals.skipped += cityResult.locationsSkipped;
      allErrors.push(...cityResult.errors);
    } catch (error: any) {
      allErrors.push(`City ${city.citySlug} failed: ${error.message}`);
    }

    // Throttle between cities, but not after the final one
    const moreToCome = index < lastIndex;
    if (moreToCome && delayMs > 0) {
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }

  return {
    totalCities: cities.length,
    totalLocationsFound: totals.found,
    totalInserted: totals.inserted,
    totalUpdated: totals.updated,
    totalSkipped: totals.skipped,
    errors: allErrors,
    durationMs: Date.now() - startedAt,
  };
}
/**
 * Summarize the active Dutchie rows in dutchie_discovery_locations.
 *
 * @returns
 *   - total: number of active locations
 *   - withCoordinates: active locations with both latitude and longitude set
 *   - byStatus: count per status, largest first
 *   - byState: count per non-null state_code, largest first, top 20 only
 */
async getStats(): Promise<{
  total: number;
  withCoordinates: number;
  byStatus: Array<{ status: string; count: number }>;
  byState: Array<{ stateCode: string; count: number }>;
}> {
  const totalSql = `
SELECT COUNT(*) as cnt FROM dutchie_discovery_locations
WHERE platform = 'dutchie' AND active = TRUE
`;
  const withCoordsSql = `
SELECT COUNT(*) as cnt FROM dutchie_discovery_locations
WHERE platform = 'dutchie' AND active = TRUE
AND latitude IS NOT NULL AND longitude IS NOT NULL
`;
  const byStatusSql = `
SELECT status, COUNT(*) as cnt
FROM dutchie_discovery_locations
WHERE platform = 'dutchie' AND active = TRUE
GROUP BY status
ORDER BY cnt DESC
`;
  const byStateSql = `
SELECT state_code, COUNT(*) as cnt
FROM dutchie_discovery_locations
WHERE platform = 'dutchie' AND active = TRUE AND state_code IS NOT NULL
GROUP BY state_code
ORDER BY cnt DESC
LIMIT 20
`;

  // The four queries are independent, so they are issued together
  const [totalRes, coordsRes, byStatusRes, byStateRes] = await Promise.all([
    this.pool.query(totalSql),
    this.pool.query(withCoordsSql),
    this.pool.query(byStatusSql),
    this.pool.query(byStateSql),
  ]);

  // The count column is parsed from its string form; a missing row counts as 0
  const singleCount = (res: { rows: any[] }): number =>
    parseInt(res.rows[0]?.cnt || '0', 10);

  const byStatus = byStatusRes.rows.map((row) => {
    return { status: row.status, count: parseInt(row.cnt, 10) };
  });
  const byState = byStateRes.rows.map((row) => {
    return { stateCode: row.state_code, count: parseInt(row.cnt, 10) };
  });

  return {
    total: singleCount(totalRes),
    withCoordinates: singleCount(coordsRes),
    byStatus,
    byState,
  };
}
// ============================================================
// ALICE - FULL DISCOVERY FROM /CITIES PAGE
// ============================================================
/**
 * Fetch all states and cities from https://dutchie.com/cities.
 *
 * The page's __NEXT_DATA__ payload (`pageProps.states` or `pageProps.regions`)
 * is preferred. When it is missing, the DOM is scraped instead: each h2/h3 (or
 * element whose data-testid contains "state") is treated as a state header and
 * the /dispensaries/ links in its section/parent become its cities.
 *
 * State codes in the DOM fallback are resolved in this order:
 *   1. the header's `data-state-code` attribute,
 *   2. a lookup of the header text in a US state-name table,
 *   3. the first two letters of the header text, upper-cased.
 * Step 2 exists because step 3 alone is wrong for most states (e.g. "Arizona"
 * would become "AR" and "Alaska" would become "AL"). Step 3 is kept only as a
 * last resort so headers outside the table are still returned as before;
 * treat codes produced that way as unreliable.
 *
 * @returns The state -> cities hierarchy, plus an error message when the DOM
 *   fallback found nothing. Navigation/evaluation errors propagate to the
 *   caller; the browser is closed either way.
 */
async fetchCitiesFromMasterPage(): Promise<{
  states: Array<{
    stateCode: string;
    stateName: string;
    cities: Array<{ cityName: string; citySlug: string; storeCount?: number }>;
  }>;
  errors: string[];
}> {
  // Lower-cased US state name -> postal code. Passed into the page as a plain
  // object because the DOM-scrape callback runs in the browser context and
  // cannot see variables from this scope.
  const usStateCodes: Record<string, string> = {
    alabama: 'AL', alaska: 'AK', arizona: 'AZ', arkansas: 'AR',
    california: 'CA', colorado: 'CO', connecticut: 'CT', delaware: 'DE',
    'district of columbia': 'DC', florida: 'FL', georgia: 'GA', hawaii: 'HI',
    idaho: 'ID', illinois: 'IL', indiana: 'IN', iowa: 'IA',
    kansas: 'KS', kentucky: 'KY', louisiana: 'LA', maine: 'ME',
    maryland: 'MD', massachusetts: 'MA', michigan: 'MI', minnesota: 'MN',
    mississippi: 'MS', missouri: 'MO', montana: 'MT', nebraska: 'NE',
    nevada: 'NV', 'new hampshire': 'NH', 'new jersey': 'NJ', 'new mexico': 'NM',
    'new york': 'NY', 'north carolina': 'NC', 'north dakota': 'ND', ohio: 'OH',
    oklahoma: 'OK', oregon: 'OR', pennsylvania: 'PA', 'rhode island': 'RI',
    'south carolina': 'SC', 'south dakota': 'SD', tennessee: 'TN', texas: 'TX',
    utah: 'UT', vermont: 'VT', virginia: 'VA', washington: 'WA',
    'west virginia': 'WV', wisconsin: 'WI', wyoming: 'WY',
  };

  console.log('[Alice] Fetching master cities page from https://dutchie.com/cities...');
  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
  });
  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    );
    await page.goto('https://dutchie.com/cities', {
      waitUntil: 'networkidle2',
      timeout: 60000,
    });
    await new Promise((r) => setTimeout(r, 3000));

    // Try to extract from __NEXT_DATA__ (null when absent or not valid JSON)
    const citiesData = await page.evaluate(() => {
      const script = document.querySelector('script#__NEXT_DATA__');
      if (script) {
        try {
          const data = JSON.parse(script.textContent || '{}');
          return data?.props?.pageProps || null;
        } catch {
          return null;
        }
      }
      return null;
    });

    const states: Array<{
      stateCode: string;
      stateName: string;
      cities: Array<{ cityName: string; citySlug: string; storeCount?: number }>;
    }> = [];
    const errors: string[] = [];

    if (citiesData?.states || citiesData?.regions) {
      // Parse from structured data
      const statesList = citiesData.states || citiesData.regions || [];
      for (const state of statesList) {
        const stateCode = state.code || state.stateCode || state.abbreviation || '';
        const stateName = state.name || state.stateName || '';
        const cities = (state.cities || []).map((c: any) => ({
          cityName: c.name || c.cityName || '',
          citySlug: c.slug || c.citySlug || c.name?.toLowerCase().replace(/\s+/g, '-') || '',
          storeCount: c.dispensaryCount || c.storeCount || undefined,
        }));
        // States without a code or without cities are dropped
        if (stateCode && cities.length > 0) {
          states.push({ stateCode, stateName, cities });
        }
      }
    } else {
      // Fallback: DOM scraping
      console.log('[Alice] No __NEXT_DATA__, attempting DOM scrape...');
      const scrapedStates = await page.evaluate((nameToCode: Record<string, string>) => {
        const result: Array<{
          stateCode: string;
          stateName: string;
          cities: Array<{ cityName: string; citySlug: string }>;
        }> = [];
        // Look for state sections
        const stateHeaders = document.querySelectorAll('h2, h3, [data-testid*="state"]');
        stateHeaders.forEach((header) => {
          const stateName = header.textContent?.trim() || '';
          // Own-property check so header text like "constructor" cannot hit
          // something inherited from Object.prototype.
          const lookupKey = stateName.toLowerCase();
          const knownCode = Object.prototype.hasOwnProperty.call(nameToCode, lookupKey)
            ? nameToCode[lookupKey]
            : '';
          // Explicit attribute, then name table, then the legacy two-letter guess
          const stateCode =
            (header as HTMLElement).dataset?.stateCode ||
            knownCode ||
            stateName.substring(0, 2).toUpperCase();
          // Find city links following this header
          const container = header.closest('section') || header.parentElement;
          const cityLinks = container?.querySelectorAll('a[href*="/dispensaries/"]') || [];
          const cities: Array<{ cityName: string; citySlug: string }> = [];
          cityLinks.forEach((link) => {
            const href = (link as HTMLAnchorElement).href || '';
            const match = href.match(/\/dispensaries\/([^/?]+)/);
            if (match) {
              cities.push({
                cityName: link.textContent?.trim() || '',
                citySlug: match[1],
              });
            }
          });
          if (stateName && cities.length > 0) {
            result.push({ stateCode, stateName, cities });
          }
        });
        return result;
      }, usStateCodes);
      states.push(...scrapedStates);
      if (states.length === 0) {
        errors.push('Could not parse cities from master page');
      }
    }

    console.log(`[Alice] Found ${states.length} states with cities from master page`);
    return { states, errors };
  } finally {
    // Always release the browser, including when navigation or evaluation throws
    await browser.close();
  }
}
/**
 * Upsert the cities found on the master page into dutchie_discovery_cities.
 *
 * Each city is matched on platform 'dutchie' + city_slug. New cities are
 * inserted with country 'US' and crawling enabled; existing cities get
 * last_verified_at refreshed and their name/state/store count updated.
 *
 * Two guards apply:
 * - Cities with an empty slug are skipped (with a warning). They cannot be
 *   matched reliably, and the slug is what builds the crawl URL.
 * - On update, empty strings are sent as NULL so the COALESCE guards keep the
 *   stored value. The master-page parser emits '' for missing names, which
 *   would otherwise overwrite good data.
 *
 * @param states - State -> cities hierarchy, as returned by fetchCitiesFromMasterPage.
 * @returns Number of city rows inserted and updated (skipped cities count as neither).
 */
async upsertCitiesFromMaster(states: Array<{
  stateCode: string;
  stateName: string;
  cities: Array<{ cityName: string; citySlug: string; storeCount?: number }>;
}>): Promise<{ inserted: number; updated: number }> {
  let inserted = 0;
  let updated = 0;

  // '' (or any falsy string) -> null, so COALESCE($n, column) keeps the column
  const blankToNull = (value: string): string | null => (value ? value : null);

  for (const state of states) {
    for (const city of state.cities) {
      if (!city.citySlug) {
        console.warn(
          `[Alice] Skipping city without slug: "${city.cityName}" (${state.stateCode})`
        );
        continue;
      }

      const existing = await this.pool.query(
        `SELECT id FROM dutchie_discovery_cities
WHERE platform = 'dutchie' AND city_slug = $1`,
        [city.citySlug]
      );

      if (existing.rows.length === 0) {
        // Insert new city
        await this.pool.query(
          `INSERT INTO dutchie_discovery_cities (
platform, city_name, city_slug, state_code, state_name,
country_code, crawl_enabled, discovered_at, last_verified_at,
store_count_reported, created_at, updated_at
) VALUES ($1, $2, $3, $4, $5, $6, $7, NOW(), NOW(), $8, NOW(), NOW())`,
          [
            'dutchie',
            city.cityName,
            city.citySlug,
            state.stateCode,
            state.stateName,
            'US',
            true,
            city.storeCount || null,
          ]
        );
        inserted++;
      } else {
        // Update existing city; NULL parameters leave the stored value untouched
        await this.pool.query(
          `UPDATE dutchie_discovery_cities SET
city_name = COALESCE($2, city_name),
state_code = COALESCE($3, state_code),
state_name = COALESCE($4, state_name),
last_verified_at = NOW(),
store_count_reported = COALESCE($5, store_count_reported),
updated_at = NOW()
WHERE id = $1`,
          [
            existing.rows[0].id,
            blankToNull(city.cityName),
            blankToNull(state.stateCode),
            blankToNull(state.stateName),
            city.storeCount ?? null,
          ]
        );
        updated++;
      }
    }
  }
  return { inserted, updated };
}
/**
 * Detect stores that no longer appear in the source and mark them as retired
 * (active = FALSE, retired_at = NOW(), retirement_reason = 'removed_from_source')
 * instead of deleting them.
 *
 * Every active, non-retired Dutchie location whose platform_location_id is not
 * in `currentLocationIds` is retired.
 *
 * Safety guard: an empty `currentLocationIds` is treated as "no data" rather
 * than "every store was removed". Nothing is retired in that case, because an
 * empty set is what a failed or blocked discovery run produces and would
 * otherwise retire the whole table.
 *
 * NOTE(review): the comparison is against ALL active locations, so the caller
 * presumably must pass the ids from a complete run; a run scoped to a subset
 * of states would retire stores elsewhere — confirm against the caller.
 *
 * @param currentLocationIds - platform_location_id values seen in the latest run.
 * @returns How many stores were retired and their platform_location_id values.
 */
async detectAndMarkRemovedStores(
  currentLocationIds: Set<string>
): Promise<{ retiredCount: number; retiredIds: string[] }> {
  if (currentLocationIds.size === 0) {
    console.warn('[Alice] No current location ids supplied; skipping retirement to avoid retiring every store');
    return { retiredCount: 0, retiredIds: [] };
  }

  // Get all active locations we know about
  const { rows: existingLocations } = await this.pool.query<{
    id: number;
    platform_location_id: string;
    name: string;
  }>(`
SELECT id, platform_location_id, name
FROM dutchie_discovery_locations
WHERE platform = 'dutchie'
AND active = TRUE
AND retired_at IS NULL
`);

  const retiredIds: string[] = [];
  for (const loc of existingLocations) {
    if (currentLocationIds.has(loc.platform_location_id)) {
      continue; // still present in source
    }
    // This store no longer appears in source - mark as retired
    await this.pool.query(
      `UPDATE dutchie_discovery_locations SET
active = FALSE,
retired_at = NOW(),
retirement_reason = 'removed_from_source',
updated_at = NOW()
WHERE id = $1`,
      [loc.id]
    );
    retiredIds.push(loc.platform_location_id);
    console.log(`[Alice] Marked store as retired: ${loc.name} (${loc.platform_location_id})`);
  }
  return { retiredCount: retiredIds.length, retiredIds };
}
/**
 * Detect whether a known location's slug differs from `newSlug` and, if so,
 * store the new slug while remembering the old one in previous_slug and
 * stamping slug_changed_at.
 *
 * @param locationId - platform_location_id of the location to check.
 * @param newSlug - Slug observed in the latest discovery.
 * @returns `{ changed: true, previousSlug }` when the slug was updated;
 *   `{ changed: false }` when the location is unknown, has no stored slug,
 *   or the slug is unchanged.
 */
async detectSlugChanges(
  locationId: string,
  newSlug: string
): Promise<{ changed: boolean; previousSlug?: string }> {
  const lookup = await this.pool.query<{ platform_slug: string }>(
    `SELECT platform_slug FROM dutchie_discovery_locations
WHERE platform = 'dutchie' AND platform_location_id = $1`,
    [locationId]
  );

  const isKnown = lookup.rows.length !== 0;
  const storedSlug = isKnown ? lookup.rows[0].platform_slug : undefined;
  const slugUnchanged = !storedSlug || storedSlug === newSlug;
  if (!isKnown || slugUnchanged) {
    return { changed: false };
  }

  // Slug changed - update with tracking
  await this.pool.query(
    `UPDATE dutchie_discovery_locations SET
platform_slug = $1,
previous_slug = $2,
slug_changed_at = NOW(),
updated_at = NOW()
WHERE platform = 'dutchie' AND platform_location_id = $3`,
    [newSlug, storedSlug, locationId]
  );
  console.log(`[Alice] Slug change detected: ${storedSlug} -> ${newSlug}`);
  return { changed: true, previousSlug: storedSlug };
}
/**
 * Full discovery run with change detection (Alice's main job).
 *
 * Steps:
 *  1. Fetch the master cities page (`fetchCitiesFromMasterPage`).
 *  2. Upsert the returned cities (`upsertCitiesFromMaster`).
 *  3. Run `discoverForCity` for every enabled city, one at a time, waiting
 *     `delayMs` between cities.
 *  4. Retire stores that were not seen recently (`detectAndMarkRemovedStores`).
 *     This only happens when no `scope` object was passed at all and the run
 *     found at least one location.
 *
 * A failure in one city is recorded in `errors` and does not abort the run.
 * Errors thrown by steps 1, 2 and 4 are not caught here and propagate.
 *
 * @param options.scope.states   when set, only these state codes are processed
 * @param options.scope.storeIds accepted for interface compatibility; not used by this method
 * @param options.delayMs        pause between cities in milliseconds (default 2000)
 * @returns run summary. `slugChangedCount` is always 0 because this method
 *          does not call `detectSlugChanges`.
 */
async runFullDiscoveryWithChangeDetection(options: {
  scope?: { states?: string[]; storeIds?: number[] };
  delayMs?: number;
} = {}): Promise<{
  statesDiscovered: number;
  citiesDiscovered: number;
  newStoreCount: number;
  removedStoreCount: number;
  updatedStoreCount: number;
  slugChangedCount: number;
  totalLocationsFound: number;
  errors: string[];
  durationMs: number;
}> {
  const startTime = Date.now();
  const { scope, delayMs = 2000 } = options;
  const errors: string[] = [];
  // Never incremented: slug tracking is not wired into this run
  const slugChangedCount = 0;
  // Built once so the filters below need no non-null assertions
  const scopedStates = scope?.states ? new Set(scope.states) : null;

  console.log('[Alice] Starting full discovery with change detection...');
  if (scope?.states) {
    console.log(`[Alice] Scope limited to states: ${scope.states.join(', ')}`);
  }

  // Step 1: Fetch master cities page
  const { states: masterStates, errors: fetchErrors } = await this.fetchCitiesFromMasterPage();
  errors.push(...fetchErrors);

  // Filter by scope if provided
  const statesToProcess = scopedStates
    ? masterStates.filter((s) => scopedStates.has(s.stateCode))
    : masterStates;

  // Step 2: Upsert cities
  const citiesResult = await this.upsertCitiesFromMaster(statesToProcess);
  console.log(`[Alice] Cities: ${citiesResult.inserted} new, ${citiesResult.updated} updated`);

  // Step 3: Discover locations for each city
  let totalLocationsFound = 0;
  let totalInserted = 0;
  let totalUpdated = 0;

  const cities = await this.getEnabledCities();
  const citiesToProcess = scopedStates
    ? cities.filter((c) => c.stateCode && scopedStates.has(c.stateCode))
    : cities;

  for (let i = 0; i < citiesToProcess.length; i++) {
    const city = citiesToProcess[i];
    console.log(`[Alice] City ${i + 1}/${citiesToProcess.length}: ${city.cityName}, ${city.stateCode}`);

    try {
      const result = await this.discoverForCity(city);
      totalLocationsFound += result.locationsFound;
      totalInserted += result.locationsInserted;
      totalUpdated += result.locationsUpdated;
      errors.push(...result.errors);
    } catch (error: unknown) {
      // Anything can be thrown (including null or a string); reading `.message`
      // unguarded would crash here and abort the remaining cities.
      const detail = (error as { message?: unknown } | null | undefined)?.message ?? error;
      errors.push(`City ${city.citySlug}: ${String(detail)}`);
    }

    // Throttle between cities, but do not wait after the last one
    if (i < citiesToProcess.length - 1 && delayMs > 0) {
      await new Promise((r) => setTimeout(r, delayMs));
    }
  }

  // Steps 4-5: Detect removed stores (only if we had a successful discovery).
  // Only detect removals on full (unscoped) runs.
  let removedResult: { retiredCount: number; retiredIds: string[] } = {
    retiredCount: 0,
    retiredIds: [],
  };
  if (totalLocationsFound > 0 && !scope) {
    // discoverForCity does not return ids, so "still present" is approximated
    // as "active and seen within the last day".
    // NOTE(review): presumably discoverForCity refreshes last_seen_at - confirm.
    // Stores in a city whose crawl failed are protected only by this window.
    const { rows: currentLocations } = await this.pool.query<{ platform_location_id: string }>(
      `SELECT platform_location_id FROM dutchie_discovery_locations
WHERE platform = 'dutchie' AND active = TRUE AND last_seen_at > NOW() - INTERVAL '1 day'`
    );
    const allLocationIds = new Set<string>(
      currentLocations.map((loc) => loc.platform_location_id)
    );
    removedResult = await this.detectAndMarkRemovedStores(allLocationIds);
  }

  const durationMs = Date.now() - startTime;

  console.log('[Alice] Full discovery complete:');
  console.log(` States: ${statesToProcess.length}`);
  console.log(` Cities: ${citiesToProcess.length}`);
  console.log(` Locations found: ${totalLocationsFound}`);
  console.log(` New: ${totalInserted}, Updated: ${totalUpdated}`);
  console.log(` Removed: ${removedResult.retiredCount}`);
  console.log(` Duration: ${(durationMs / 1000).toFixed(1)}s`);

  return {
    statesDiscovered: statesToProcess.length,
    citiesDiscovered: citiesToProcess.length,
    newStoreCount: totalInserted,
    removedStoreCount: removedResult.retiredCount,
    updatedStoreCount: totalUpdated,
    slugChangedCount,
    totalLocationsFound,
    errors,
    durationMs,
  };
}
}
export default DtLocationDiscoveryService;