feat: Add v2 architecture with multi-state support and orchestrator services

Major additions:
- Multi-state expansion: states table, StateSelector, NationalDashboard, StateHeatmap, CrossStateCompare
- Orchestrator services: trace service, error taxonomy, retry manager, proxy rotator
- Discovery system: dutchie discovery service, geo validation, city seeding scripts
- Analytics infrastructure: analytics v2 routes, brand/pricing/stores intelligence pages
- Local development: setup-local.sh starts all 5 services (postgres, backend, cannaiq, findadispo, findagram)
- Migrations 037-056: crawler profiles, states, analytics indexes, worker metadata

Frontend pages added:
- Discovery, ChainsDashboard, IntelligenceBrands, IntelligencePricing, IntelligenceStores
- StateHeatmap, CrossStateCompare, SyncInfoPanel

Components added:
- StateSelector, OrchestratorTraceModal, WorkflowStepper

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-07 11:30:57 -07:00
parent 8ac64ba077
commit b4a2fb7d03
248 changed files with 60714 additions and 666 deletions

View File

@@ -0,0 +1,474 @@
/**
* Dutchie City Discovery Service
*
* Discovers cities from the Dutchie cities page.
* Each city can contain multiple dispensary locations.
*
* Source: https://dutchie.com/cities
*
* This module ONLY handles city discovery and upserts to dutchie_discovery_cities.
* It does NOT create any dispensary records.
*/
import { Pool } from 'pg';
import axios from 'axios';
import * as cheerio from 'cheerio';
import {
DiscoveryCity,
DiscoveryCityRow,
DutchieCityResponse,
CityDiscoveryResult,
mapCityRowToCity,
} from './types';
// Public Dutchie page listing cities; fetched and parsed by fetchCitiesFromPage().
const CITIES_PAGE_URL = 'https://dutchie.com/cities';
// Platform identifier: bound to the `platform` column when inserting into, and
// filtering, dutchie_discovery_cities.
const PLATFORM = 'dutchie';
// ============================================================
// CITY PAGE SCRAPING
// ============================================================
/**
 * Download the Dutchie cities page and collect the cities linked from it.
 *
 * Two anchor shapes are recognised:
 *   - `/dispensaries/{state}/{city}`: yields the slug, the upper-cased state
 *     code and a country code derived from the length of the state segment.
 *   - `/city/{slug}`: yields the slug only; state and country stay unset.
 *
 * A city's name is the link text, or the slug with dashes replaced by spaces
 * when the link has no text.
 *
 * @returns Cities de-duplicated on the (country, state, slug) triple. The first
 *          occurrence of each key is kept and first-seen order is preserved.
 * @throws Propagates any rejection from `axios.get`; nothing is caught here.
 */
export async function fetchCitiesFromPage(): Promise<DutchieCityResponse[]> {
  console.log(`[CityDiscovery] Fetching cities from ${CITIES_PAGE_URL}...`);

  const browserLikeHeaders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
  };
  const { data: html } = await axios.get(CITIES_PAGE_URL, {
    headers: browserLikeHeaders,
    timeout: 30000,
  });
  const $ = cheerio.load(html);

  // Keyed by country-state-slug; Map iteration order gives first-seen order.
  const byKey = new Map<string, DutchieCityResponse>();
  const keep = (candidate: DutchieCityResponse): void => {
    const key = `${candidate.countryCode || 'unknown'}-${candidate.stateCode || 'unknown'}-${candidate.slug}`;
    if (!byKey.has(key)) {
      byKey.set(key, candidate);
    }
  };

  // Pass 1: /dispensaries/{state}/{city} links.
  for (const anchor of $('a[href*="/dispensaries/"]').toArray()) {
    const link = $(anchor);
    const parts = (link.attr('href') || '').match(/\/dispensaries\/([a-z]{2,3})\/([a-z0-9-]+)/i);
    if (!parts) {
      continue;
    }
    const [, stateSegment, citySlug] = parts;
    keep({
      slug: citySlug,
      name: link.text().trim() || citySlug.replace(/-/g, ' '),
      stateCode: stateSegment.toUpperCase(),
      // Length heuristic only: a 2-letter segment is labelled US, anything longer CA.
      countryCode: stateSegment.length === 2 ? 'US' : 'CA',
    });
  }

  // Pass 2: /city/{slug} links, which carry no state or country information.
  for (const anchor of $('a[href*="/city/"]').toArray()) {
    const link = $(anchor);
    const parts = (link.attr('href') || '').match(/\/city\/([a-z0-9-]+)/i);
    if (!parts) {
      continue;
    }
    const [, citySlug] = parts;
    keep({
      slug: citySlug,
      name: link.text().trim() || citySlug.replace(/-/g, ' '),
    });
  }

  const uniqueCities = Array.from(byKey.values());
  console.log(`[CityDiscovery] Found ${uniqueCities.length} unique cities`);
  return uniqueCities;
}
/**
 * Probe a short list of candidate Dutchie JSON endpoints for a city list.
 *
 * The endpoints are guesses, so every failure (network error, non-200 status,
 * non-array body, array without any usable city) simply moves on to the next
 * candidate.
 *
 * Each array entry is validated before it is accepted:
 *   - entries without a non-empty string slug (`slug` / `city_slug`) are dropped,
 *     because a city cannot be stored or crawled without one;
 *   - a missing name falls back to the slug with dashes replaced by spaces;
 *   - a missing country defaults to `'US'`.
 *
 * @returns The cities from the first endpoint that yields at least one valid
 *          entry, or an empty array when no endpoint does.
 */
export async function fetchCitiesFromApi(): Promise<DutchieCityResponse[]> {
  console.log('[CityDiscovery] Attempting to fetch cities from API...');

  // Exploratory: none of these is a documented endpoint.
  const possibleEndpoints = [
    'https://dutchie.com/api/cities',
    'https://dutchie.com/api-3/cities',
    'https://api.dutchie.com/v1/cities',
  ];

  for (const endpoint of possibleEndpoints) {
    try {
      const response = await axios.get(endpoint, {
        headers: {
          'Accept': 'application/json',
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        },
        timeout: 10000,
        // Never reject on HTTP status; the status is inspected below.
        validateStatus: () => true,
      });

      if (response.status !== 200 || !Array.isArray(response.data)) {
        continue;
      }

      const cities: DutchieCityResponse[] = [];
      for (const entry of response.data) {
        const slug = entry?.slug || entry?.city_slug;
        if (typeof slug !== 'string' || slug.length === 0) {
          // Not a city record (or an unusable one); skip it rather than emit slug: undefined.
          continue;
        }
        const rawName = entry.name || entry.city_name;
        cities.push({
          slug,
          name: typeof rawName === 'string' && rawName.length > 0 ? rawName : slug.replace(/-/g, ' '),
          stateCode: entry.stateCode || entry.state_code || entry.state,
          countryCode: entry.countryCode || entry.country_code || entry.country || 'US',
        });
      }

      if (cities.length === 0) {
        // A JSON array that is not a city list: try the next candidate.
        continue;
      }

      console.log(`[CityDiscovery] Found cities at ${endpoint}`);
      return cities;
    } catch {
      // Best-effort probing: an unreachable endpoint is expected, try the next one.
    }
  }

  console.log('[CityDiscovery] No API endpoint found, falling back to page scraping');
  return [];
}
// ============================================================
// DATABASE OPERATIONS
// ============================================================
/**
 * Insert a city into dutchie_discovery_cities, or refresh its name and
 * `updated_at` when a row with the same
 * (platform, country_code, state_code, city_slug) already exists.
 *
 * A missing state code is bound as NULL and a missing country code as 'US'.
 *
 * NOTE(review): whether a NULL state_code can ever match the conflict target
 * depends on how the unique index treats NULLs — confirm against the migration.
 *
 * @param pool - Connection pool used to run the statement.
 * @param city - City to store.
 * @returns The row id and `isNew`, which is the statement's `(xmax = 0)` column
 *          (presumably true for a fresh insert, false for an update).
 */
export async function upsertCity(
  pool: Pool,
  city: DutchieCityResponse
): Promise<{ id: number; isNew: boolean }> {
  const upsertSql = `INSERT INTO dutchie_discovery_cities (
platform,
city_name,
city_slug,
state_code,
country_code,
updated_at
) VALUES ($1, $2, $3, $4, $5, NOW())
ON CONFLICT (platform, country_code, state_code, city_slug)
DO UPDATE SET
city_name = EXCLUDED.city_name,
updated_at = NOW()
RETURNING id, (xmax = 0) as is_new`;

  const values = [
    PLATFORM,
    city.name,
    city.slug,
    city.stateCode || null,
    city.countryCode || 'US',
  ];

  const { rows } = await pool.query(upsertSql, values);
  const [row] = rows;
  return { id: row.id, isNew: row.is_new };
}
/**
 * Record that a city has just been crawled.
 *
 * Sets `last_crawled_at` and `updated_at` to the database's current time and
 * stores the number of locations found for the city.
 *
 * @param pool - Connection pool used to run the update.
 * @param cityId - Primary key of the dutchie_discovery_cities row.
 * @param locationCount - Value written to `location_count`.
 */
export async function markCityCrawled(
  pool: Pool,
  cityId: number,
  locationCount: number
): Promise<void> {
  const updateSql = `UPDATE dutchie_discovery_cities
SET last_crawled_at = NOW(),
location_count = $2,
updated_at = NOW()
WHERE id = $1`;
  const values = [cityId, locationCount];
  await pool.query(updateSql, values);
}
/**
 * List crawl-enabled cities, least recently crawled first (never-crawled
 * cities come before everything else).
 *
 * Every caller-supplied value, including `staleDays`, is sent as a bind
 * parameter; nothing is interpolated into the SQL text.
 *
 * @param pool - Connection pool used to run the query.
 * @param options.stateCode - Only return cities with this state code.
 * @param options.countryCode - Only return cities with this country code.
 * @param options.limit - Maximum number of rows (default 100).
 * @param options.onlyStale - When true, only cities never crawled or last
 *        crawled more than `staleDays` days ago (default false).
 * @param options.staleDays - Staleness threshold in days (default 7); only
 *        used when `onlyStale` is true.
 * @returns The matching rows mapped through `mapCityRowToCity`.
 */
export async function getCitiesToCrawl(
  pool: Pool,
  options: {
    stateCode?: string;
    countryCode?: string;
    limit?: number;
    onlyStale?: boolean;
    staleDays?: number;
  } = {}
): Promise<DiscoveryCity[]> {
  const {
    stateCode,
    countryCode,
    limit = 100,
    onlyStale = false,
    staleDays = 7,
  } = options;

  const params: Array<string | number> = [];
  // Array.prototype.push returns the new length, which is exactly the
  // 1-based placeholder index of the value just added.
  const bind = (value: string | number): string => `$${params.push(value)}`;

  const conditions: string[] = ['crawl_enabled = TRUE'];
  if (stateCode) {
    conditions.push(`state_code = ${bind(stateCode)}`);
  }
  if (countryCode) {
    conditions.push(`country_code = ${bind(countryCode)}`);
  }
  if (onlyStale) {
    // The day count is bound, then scaled by a constant one-day interval,
    // so a non-numeric value is rejected by the database instead of being
    // executed as SQL.
    conditions.push(
      `(last_crawled_at IS NULL OR last_crawled_at < NOW() - (${bind(staleDays)}::double precision * INTERVAL '1 day'))`
    );
  }

  const query = `
SELECT *
FROM dutchie_discovery_cities
WHERE ${conditions.join(' AND ')}
ORDER BY last_crawled_at ASC NULLS FIRST
LIMIT ${bind(limit)}`;

  const result = await pool.query<DiscoveryCityRow>(query, params);
  return result.rows.map(mapCityRowToCity);
}
/**
 * Look up one discovery city by primary key.
 *
 * @param pool - Connection pool used to run the query.
 * @param id - Row id in dutchie_discovery_cities.
 * @returns The mapped city, or `null` when no row has that id.
 */
export async function getCityById(
  pool: Pool,
  id: number
): Promise<DiscoveryCity | null> {
  const { rows } = await pool.query<DiscoveryCityRow>(
    `SELECT * FROM dutchie_discovery_cities WHERE id = $1`,
    [id]
  );
  return rows.length > 0 ? mapCityRowToCity(rows[0]) : null;
}
/**
 * Look up a discovery city by slug within this platform and a country.
 *
 * The state filter is applied only when `stateCode` is truthy. When several
 * rows match, the first row returned by the database is used; the query has
 * no ORDER BY.
 *
 * @param pool - Connection pool used to run the query.
 * @param slug - City slug to match.
 * @param stateCode - Optional state code filter.
 * @param countryCode - Country code filter (default 'US').
 * @returns The mapped city, or `null` when nothing matches.
 */
export async function getCityBySlug(
  pool: Pool,
  slug: string,
  stateCode?: string,
  countryCode: string = 'US'
): Promise<DiscoveryCity | null> {
  const baseSql = `
SELECT * FROM dutchie_discovery_cities
WHERE platform = $1 AND city_slug = $2 AND country_code = $3
`;
  const sql = stateCode ? baseSql + ` AND state_code = $4` : baseSql;
  const values: any[] = stateCode
    ? [PLATFORM, slug, countryCode, stateCode]
    : [PLATFORM, slug, countryCode];

  const { rows } = await pool.query<DiscoveryCityRow>(sql, values);
  return rows.length === 0 ? null : mapCityRowToCity(rows[0]);
}
// ============================================================
// MAIN DISCOVERY FUNCTION
// ============================================================
/**
 * Fetch the Dutchie city list and store it in the database.
 *
 * The JSON endpoints are tried first (`fetchCitiesFromApi`); when they return
 * nothing the HTML cities page is scraped instead (`fetchCitiesFromPage`).
 * Each city is then upserted individually. A failure on one city is recorded
 * in `errors` and counted as skipped, without aborting the run.
 *
 * @param pool - Connection pool used for the upserts.
 * @param options.dryRun - When true nothing is written; every city is still
 *        counted as upserted (default false).
 * @param options.verbose - Log one line per city (default false).
 * @returns Counts, collected error messages and the elapsed time in ms.
 */
export async function discoverCities(
  pool: Pool,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<CityDiscoveryResult> {
  const startedAt = Date.now();
  const { dryRun = false, verbose = false } = options;

  console.log('[CityDiscovery] Starting city discovery...');
  console.log(`[CityDiscovery] Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);

  // API first, page scraping as the fallback.
  const apiCities = await fetchCitiesFromApi();
  const cities = apiCities.length > 0 ? apiCities : await fetchCitiesFromPage();

  if (cities.length === 0) {
    console.log('[CityDiscovery] No cities found');
    return {
      citiesFound: 0,
      citiesUpserted: 0,
      citiesSkipped: 0,
      errors: ['No cities found from page or API'],
      durationMs: Date.now() - startedAt,
    };
  }

  const failures: string[] = [];
  let upsertedCount = 0;
  let skippedCount = 0;

  for (const city of cities) {
    try {
      if (!dryRun) {
        const outcome = await upsertCity(pool, city);
        upsertedCount += 1;
        if (verbose) {
          const action = outcome.isNew ? 'Created' : 'Updated';
          console.log(`[CityDiscovery] ${action}: ${city.name} (${city.stateCode}, ${city.countryCode}) -> ID ${outcome.id}`);
        }
      } else {
        if (verbose) {
          console.log(`[CityDiscovery][DryRun] Would upsert: ${city.name} (${city.stateCode}, ${city.countryCode})`);
        }
        upsertedCount += 1;
      }
    } catch (error: any) {
      failures.push(`City ${city.slug}: ${error.message}`);
      skippedCount += 1;
    }
  }

  const elapsedMs = Date.now() - startedAt;
  console.log(`[CityDiscovery] Complete: ${upsertedCount} upserted, ${skippedCount} skipped, ${failures.length} errors in ${elapsedMs}ms`);

  return {
    citiesFound: cities.length,
    citiesUpserted: upsertedCount,
    citiesSkipped: skippedCount,
    errors: failures,
    durationMs: elapsedMs,
  };
}
// ============================================================
// MANUAL CITY SEEDING
// ============================================================
/**
 * Upsert a caller-supplied list of cities, one at a time and in order.
 *
 * Intended for cities that the scraped cities page does not expose. A city
 * without a country code is stored as 'US'. Any upsert failure propagates and
 * stops the loop.
 *
 * @param pool - Connection pool used for the upserts.
 * @param cities - Cities to store.
 * @returns How many rows were newly created and how many already existed.
 */
export async function seedKnownCities(
  pool: Pool,
  cities: Array<{
    name: string;
    slug: string;
    stateCode: string;
    countryCode?: string;
  }>
): Promise<{ created: number; updated: number }> {
  const tally = { created: 0, updated: 0 };

  for (const { name, slug, stateCode, countryCode } of cities) {
    const { isNew } = await upsertCity(pool, {
      name,
      slug,
      stateCode,
      countryCode: countryCode || 'US',
    });
    tally[isNew ? 'created' : 'updated'] += 1;
  }

  return tally;
}
/**
 * Arizona cities available for manual seeding.
 *
 * Each entry is `{ name, slug, stateCode: 'AZ' }`. The slug is the lower-cased
 * name with spaces replaced by dashes.
 */
export const ARIZONA_CITIES = [
  'Phoenix',
  'Tucson',
  'Mesa',
  'Chandler',
  'Scottsdale',
  'Glendale',
  'Gilbert',
  'Tempe',
  'Peoria',
  'Surprise',
  'Yuma',
  'Avondale',
  'Flagstaff',
  'Goodyear',
  'Lake Havasu City',
  'Buckeye',
  'Casa Grande',
  'Sierra Vista',
  'Maricopa',
  'Oro Valley',
  'Prescott',
  'Bullhead City',
  'Prescott Valley',
  'Apache Junction',
  'Marana',
  'El Mirage',
  'Kingman',
  'Queen Creek',
  'San Luis',
  'Sahuarita',
  'Fountain Hills',
  'Nogales',
  'Douglas',
  'Eloy',
  'Somerton',
  'Paradise Valley',
  'Coolidge',
  'Cottonwood',
  'Camp Verde',
  'Show Low',
  'Payson',
  'Sedona',
  'Winslow',
  'Globe',
  'Safford',
  'Bisbee',
  'Wickenburg',
  'Page',
  'Holbrook',
  'Willcox',
].map((name) => ({
  name,
  slug: name.toLowerCase().replace(/ /g, '-'),
  stateCode: 'AZ',
}));

View File

@@ -0,0 +1,327 @@
/**
* Dutchie Discovery Crawler
*
* Main orchestrator for the Dutchie store discovery pipeline.
*
* Flow:
* 1. Discover cities from Dutchie (or use seeded cities)
* 2. For each city, discover store locations
* 3. Upsert all data to discovery tables
* 4. Admin verifies locations manually
* 5. Verified locations are promoted to canonical dispensaries
*
* This module does NOT create canonical dispensaries automatically.
*/
import { Pool } from 'pg';
import {
FullDiscoveryResult,
LocationDiscoveryResult,
DiscoveryCity,
} from './types';
import {
discoverCities,
getCitiesToCrawl,
getCityBySlug,
seedKnownCities,
ARIZONA_CITIES,
} from './city-discovery';
import {
discoverLocationsForCity,
} from './location-discovery';
// ============================================================
// FULL DISCOVERY
// ============================================================
/**
 * Options accepted by `runFullDiscovery`. Every field is optional; the
 * defaults quoted below are the ones `runFullDiscovery` applies.
 */
export interface DiscoveryCrawlerOptions {
  /** Only crawl cities with this state code. No default. */
  stateCode?: string;
  /** Only crawl cities with this country code. Default 'US'. */
  countryCode?: string;
  /** Maximum number of cities to crawl in one run. Default 50. */
  cityLimit?: number;
  /** Skip the city discovery step and use cities already stored. Default false. */
  skipCityDiscovery?: boolean;
  /** Only crawl cities never crawled or older than `staleDays`. Default true. */
  onlyStale?: boolean;
  /** Staleness threshold in days, used with `onlyStale`. Default 7. */
  staleDays?: number;
  /** Forwarded to the city and location discovery steps. Default false. */
  dryRun?: boolean;
  /** Forwarded to the city and location discovery steps. Default false. */
  verbose?: boolean;
}
/**
 * Run the full discovery pipeline:
 *   1. Discover/refresh cities (unless `skipCityDiscovery` is set).
 *   2. Select the cities to crawl from the database.
 *   3. Discover locations for each selected city, one at a time.
 *
 * A failure on one city is logged and recorded as a result with zero counts
 * and the error message; the run then continues with the next city. The pause
 * between cities is applied after failures as well as successes, so repeated
 * errors do not turn into back-to-back requests.
 *
 * @param pool - Connection pool handed to every step.
 * @param options - See `DiscoveryCrawlerOptions`. Defaults: dryRun false,
 *        verbose false, countryCode 'US', cityLimit 50, skipCityDiscovery
 *        false, onlyStale true, staleDays 7.
 * @returns The city step result, one result per crawled city, location totals
 *          and the total duration in ms.
 * @throws Propagates rejections from `discoverCities` and `getCitiesToCrawl`;
 *         only per-city location discovery errors are contained.
 */
export async function runFullDiscovery(
  pool: Pool,
  options: DiscoveryCrawlerOptions = {}
): Promise<FullDiscoveryResult> {
  // Pause between consecutive cities, to rate-limit requests.
  const INTER_CITY_DELAY_MS = 2000;

  const startTime = Date.now();
  const {
    dryRun = false,
    verbose = false,
    stateCode,
    countryCode = 'US',
    cityLimit = 50,
    skipCityDiscovery = false,
    onlyStale = true,
    staleDays = 7,
  } = options;

  console.log('='.repeat(60));
  console.log('DUTCHIE DISCOVERY CRAWLER');
  console.log('='.repeat(60));
  console.log(`Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);
  if (stateCode) console.log(`State: ${stateCode}`);
  console.log(`Country: ${countryCode}`);
  console.log(`City limit: ${cityLimit}`);
  console.log('');

  // Step 1: Discover/refresh cities
  let cityResult = {
    citiesFound: 0,
    citiesUpserted: 0,
    citiesSkipped: 0,
    errors: [] as string[],
    durationMs: 0,
  };
  if (!skipCityDiscovery) {
    console.log('[Discovery] Step 1: Discovering cities...');
    cityResult = await discoverCities(pool, { dryRun, verbose });
  } else {
    console.log('[Discovery] Step 1: Skipping city discovery (using existing cities)');
  }

  // Step 2: Get cities to crawl
  console.log('[Discovery] Step 2: Getting cities to crawl...');
  const cities = await getCitiesToCrawl(pool, {
    stateCode,
    countryCode,
    limit: cityLimit,
    onlyStale,
    staleDays,
  });
  console.log(`[Discovery] Found ${cities.length} cities to crawl`);

  // Step 3: Discover locations for each city
  console.log('[Discovery] Step 3: Discovering locations...');
  const locationResults: LocationDiscoveryResult[] = [];
  let totalLocationsFound = 0;
  let totalLocationsUpserted = 0;

  for (let i = 0; i < cities.length; i++) {
    const city = cities[i];
    console.log(`\n[Discovery] City ${i + 1}/${cities.length}: ${city.cityName}, ${city.stateCode}`);
    try {
      const result = await discoverLocationsForCity(pool, city, { dryRun, verbose });
      locationResults.push(result);
      totalLocationsFound += result.locationsFound;
      totalLocationsUpserted += result.locationsUpserted;
    } catch (error: unknown) {
      // Anything can be thrown; only Error instances carry a message.
      const message = error instanceof Error ? error.message : String(error);
      console.error(`[Discovery] Error crawling ${city.cityName}: ${message}`);
      locationResults.push({
        cityId: city.id,
        citySlug: city.citySlug,
        locationsFound: 0,
        locationsUpserted: 0,
        locationsNew: 0,
        locationsUpdated: 0,
        errors: [message],
        durationMs: 0,
      });
    }

    // Rate limiting between cities, whether or not this one succeeded.
    if (i < cities.length - 1) {
      await new Promise((r) => setTimeout(r, INTER_CITY_DELAY_MS));
    }
  }

  const durationMs = Date.now() - startTime;

  // Summary
  console.log('\n' + '='.repeat(60));
  console.log('DISCOVERY COMPLETE');
  console.log('='.repeat(60));
  console.log(`Duration: ${(durationMs / 1000).toFixed(1)}s`);
  console.log('');
  console.log('Cities:');
  console.log(` Discovered: ${cityResult.citiesFound}`);
  console.log(` Upserted: ${cityResult.citiesUpserted}`);
  console.log(` Crawled: ${cities.length}`);
  console.log('');
  console.log('Locations:');
  console.log(` Found: ${totalLocationsFound}`);
  console.log(` Upserted: ${totalLocationsUpserted}`);
  console.log('');

  const totalErrors = cityResult.errors.length +
    locationResults.reduce((sum, r) => sum + r.errors.length, 0);
  if (totalErrors > 0) {
    console.log(`Errors: ${totalErrors}`);
  }

  return {
    cities: cityResult,
    locations: locationResults,
    totalLocationsFound,
    totalLocationsUpserted,
    durationMs,
  };
}
// ============================================================
// SINGLE CITY DISCOVERY
// ============================================================
/**
 * Discover locations for one city identified by its slug.
 *
 * When the city is not stored yet and a state code was supplied, the city row
 * is created first, with a display name derived from the slug (dashes become
 * spaces, each word capitalised). That row is created regardless of `dryRun`.
 *
 * @param pool - Connection pool handed to every step.
 * @param citySlug - Slug of the city to crawl.
 * @param options.stateCode - State code; required for creating a missing city.
 * @param options.countryCode - Country code (default 'US').
 * @param options.dryRun - Forwarded to location discovery (default false).
 * @param options.verbose - Forwarded to location discovery (default false).
 * @returns The location discovery result, or `null` when the city is unknown
 *          and could not be created.
 */
export async function discoverCity(
  pool: Pool,
  citySlug: string,
  options: {
    stateCode?: string;
    countryCode?: string;
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult | null> {
  const { stateCode, countryCode = 'US', dryRun = false, verbose = false } = options;
  const lookUp = () => getCityBySlug(pool, citySlug, stateCode, countryCode);

  let city = await lookUp();

  if (!city && stateCode) {
    // Enough information to create the row ourselves, then read it back.
    console.log(`[Discovery] City ${citySlug} not found, creating...`);
    const displayName = citySlug
      .replace(/-/g, ' ')
      .replace(/\b\w/g, (c) => c.toUpperCase());
    await seedKnownCities(pool, [
      { name: displayName, slug: citySlug, stateCode, countryCode },
    ]);
    city = await lookUp();
  }

  if (!city) {
    console.log(`[Discovery] City ${citySlug} not found and could not be created`);
    return null;
  }

  return await discoverLocationsForCity(pool, city, { dryRun, verbose });
}
// ============================================================
// STATE-WIDE DISCOVERY
// ============================================================
/**
 * Seed the known cities of a state, then crawl every stored city of that state.
 *
 * The state code is trimmed and upper-cased before use, so `'az'` and `' AZ '`
 * behave like `'AZ'`; the state codes produced elsewhere in this module
 * (scraped links, `ARIZONA_CITIES`) are upper-case. Only Arizona has a seed
 * list. Seeding writes to the database, so it is skipped on a dry run.
 *
 * The crawl itself runs through `runFullDiscovery` with city discovery
 * skipped, country 'US' and `onlyStale` disabled.
 *
 * @param pool - Connection pool handed to every step.
 * @param stateCode - State code, in any letter case.
 * @param options.dryRun - Skip seeding and forward the flag (default false).
 * @param options.verbose - Forwarded to the crawl (default false).
 * @param options.cityLimit - Maximum number of cities to crawl (default 100).
 * @returns The result of `runFullDiscovery` for the state.
 */
export async function discoverState(
  pool: Pool,
  stateCode: string,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
    cityLimit?: number;
  } = {}
): Promise<FullDiscoveryResult> {
  const { dryRun = false, verbose = false, cityLimit = 100 } = options;
  const normalizedState = stateCode.trim().toUpperCase();

  console.log(`[Discovery] Discovering state: ${normalizedState}`);

  // Seed known cities for this state
  if (normalizedState === 'AZ') {
    if (dryRun) {
      console.log(`[Discovery] Dry run: would seed ${ARIZONA_CITIES.length} Arizona cities`);
    } else {
      console.log('[Discovery] Seeding Arizona cities...');
      const seeded = await seedKnownCities(pool, ARIZONA_CITIES);
      console.log(`[Discovery] Seeded ${seeded.created} new cities, ${seeded.updated} updated`);
    }
  }

  // Run full discovery for this state
  return await runFullDiscovery(pool, {
    dryRun,
    verbose,
    stateCode: normalizedState,
    countryCode: 'US',
    cityLimit,
    skipCityDiscovery: true, // Use seeded cities
    onlyStale: false, // Crawl all
  });
}
// ============================================================
// STATISTICS
// ============================================================
/**
 * Snapshot returned by `getDiscoveryStats`.
 */
export interface DiscoveryStats {
  /** Counts over dutchie_discovery_cities. */
  cities: {
    /** All city rows. */
    total: number;
    /** Rows never crawled (`last_crawled_at` is NULL). */
    neverCrawled: number;
    /** Rows crawled within the last 24 hours. */
    crawledLast24h: number;
  };
  /** Counts over active rows of dutchie_discovery_locations. */
  locations: {
    /** All active locations. */
    total: number;
    /** Active locations per state code, highest count first. */
    byState: Array<{ stateCode: string; count: number }>;
    /** Active locations with status 'discovered'. */
    discovered: number;
    /** Active locations with status 'verified'. */
    verified: number;
    /** Active locations with status 'rejected'. */
    rejected: number;
    /** Active locations with status 'merged'. */
    merged: number;
  };
}
/**
 * Collect discovery statistics from the database.
 *
 * Runs two batches of three concurrent queries: first the city counts, then
 * the location counts (restricted to active locations). A status that has no
 * rows is reported as 0.
 *
 * @param pool - Connection pool used to run the queries.
 * @returns City and location counts; see `DiscoveryStats`.
 */
export async function getDiscoveryStats(pool: Pool): Promise<DiscoveryStats> {
  // COUNT(*) arrives as a string in the `cnt` column; parse it as base 10.
  const toCount = (value: string): number => parseInt(value, 10);

  const cityResults = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities'),
    pool.query(`SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at > NOW() - INTERVAL '24 hours'`),
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_cities WHERE last_crawled_at IS NULL'),
  ]);
  const allCities = cityResults[0];
  const recentCities = cityResults[1];
  const uncrawledCities = cityResults[2];

  const locationResults = await Promise.all([
    pool.query('SELECT COUNT(*) as cnt FROM dutchie_discovery_locations WHERE active = TRUE'),
    pool.query(`
SELECT status, COUNT(*) as cnt
FROM dutchie_discovery_locations
WHERE active = TRUE
GROUP BY status
`),
    pool.query(`
SELECT state_code, COUNT(*) as cnt
FROM dutchie_discovery_locations
WHERE active = TRUE AND state_code IS NOT NULL
GROUP BY state_code
ORDER BY cnt DESC
`),
  ]);
  const allLocations = locationResults[0];
  const statusRows = locationResults[1];
  const stateRows = locationResults[2];

  const perStatus = {} as Record<string, number>;
  for (const row of statusRows.rows) {
    perStatus[row.status] = toCount(row.cnt);
  }

  const byState = stateRows.rows.map((row) => ({
    stateCode: row.state_code,
    count: toCount(row.cnt),
  }));

  return {
    cities: {
      total: toCount(allCities.rows[0].cnt),
      crawledLast24h: toCount(recentCities.rows[0].cnt),
      neverCrawled: toCount(uncrawledCities.rows[0].cnt),
    },
    locations: {
      total: toCount(allLocations.rows[0].cnt),
      discovered: perStatus.discovered || 0,
      verified: perStatus.verified || 0,
      rejected: perStatus.rejected || 0,
      merged: perStatus.merged || 0,
      byState,
    },
  };
}

View File

@@ -0,0 +1,37 @@
/**
* Dutchie Discovery Module
*
* Exports all discovery-related functionality for use in the main application.
*/
// Types
export * from './types';
// City Discovery
export {
discoverCities,
getCitiesToCrawl,
getCityBySlug,
seedKnownCities,
ARIZONA_CITIES,
} from './city-discovery';
// Location Discovery
export {
discoverLocationsForCity,
fetchLocationsForCity,
upsertLocation,
} from './location-discovery';
// Discovery Crawler (Orchestrator)
export {
  runFullDiscovery,
  discoverCity,
  discoverState,
  getDiscoveryStats,
} from './discovery-crawler';
// DiscoveryCrawlerOptions and DiscoveryStats are interfaces, so they are
// re-exported type-only. A plain `export { ... }` of a type fails under
// `isolatedModules` and with single-file transpilers, which cannot know
// that the name has no runtime value.
export type {
  DiscoveryCrawlerOptions,
  DiscoveryStats,
} from './discovery-crawler';
// Routes
export { createDiscoveryRoutes } from './routes';

View File

@@ -0,0 +1,686 @@
/**
* Dutchie Location Discovery Service
*
* Discovers store locations from Dutchie city pages.
* Each city can contain multiple dispensary locations.
*
* This module:
* 1. Fetches location listings for a given city
* 2. Upserts locations into dutchie_discovery_locations
* 3. Does NOT create any canonical dispensary records
*
* Locations remain in "discovered" status until manually verified.
*/
import { Pool } from 'pg';
import axios from 'axios';
import puppeteer from 'puppeteer-extra';
import type { Browser, Page, Protocol } from 'puppeteer';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import {
DiscoveryLocation,
DiscoveryLocationRow,
DutchieLocationResponse,
LocationDiscoveryResult,
DiscoveryStatus,
mapLocationRowToLocation,
} from './types';
import { DiscoveryCity } from './types';
// Register the stealth plugin on the puppeteer-extra instance at module load,
// before any browser is launched through it.
puppeteer.use(StealthPlugin());
// Platform identifier constant for this module.
const PLATFORM = 'dutchie';
// ============================================================
// GRAPHQL / API FETCHING
// ============================================================
/**
 * A live browser session together with the request credentials captured from
 * it. The holder is responsible for closing `browser` when done.
 */
interface SessionCredentials {
  /** The launched browser instance. */
  browser: Browser;
  /** The page opened in `browser`; location data is read from it. */
  page: Page;
  /** User-Agent string set on `page`, reused for direct HTTP requests. */
  userAgent: string;
  /** The page's cookies joined as `name=value; name=value`, for a Cookie header. */
  cookies: string;
}
/**
 * Launch a browser, open the city's Dutchie listing page and capture the
 * cookies it sets.
 *
 * A navigation failure is only logged: the session is still returned with
 * whatever cookies exist at that point. Any other failure after the browser
 * has been launched closes the browser before the error is rethrown, so no
 * browser process is left behind.
 *
 * @param citySlug - City segment of the listing URL.
 * @param stateCode - State segment of the listing URL, in any letter case.
 *        Defaults to 'az', the value that used to be hard-coded.
 * @returns The open browser and page plus the captured cookies and user agent.
 */
async function createSession(
  citySlug: string,
  stateCode: string = 'az'
): Promise<SessionCredentials> {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-blink-features=AutomationControlled',
    ],
  });

  try {
    const page = await browser.newPage();
    const userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
    await page.setUserAgent(userAgent);
    await page.setViewport({ width: 1920, height: 1080 });

    // Runs in every new document before the page's own scripts: report
    // `navigator.webdriver` as false and define `window.chrome`.
    await page.evaluateOnNewDocument(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
      (window as any).chrome = { runtime: {} };
    });

    // Navigate to the city's dispensaries page to get cookies
    const url = `https://dutchie.com/dispensaries/${stateCode.toLowerCase()}/${citySlug}`;
    console.log(`[LocationDiscovery] Loading ${url} to establish session...`);
    try {
      await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout: 60000,
      });
      // Short settle time after the network goes idle.
      await new Promise((r) => setTimeout(r, 2000));
    } catch (error: any) {
      console.warn(`[LocationDiscovery] Navigation warning: ${error.message}`);
    }

    const cookies = await page.cookies();
    const cookieString = cookies.map((c: Protocol.Network.Cookie) => `${c.name}=${c.value}`).join('; ');
    return { cookies: cookieString, userAgent, browser, page };
  } catch (error) {
    // Session setup failed after launch: release the browser, keep the original error.
    await browser.close().catch(() => undefined);
    throw error;
  }
}
/**
 * Close the browser owned by a session.
 */
async function closeSession(session: SessionCredentials): Promise<void> {
  await session.browser.close();
}
/**
 * Fetch the dispensary locations of a city.
 *
 * Three strategies are tried in order and the first non-empty result wins:
 *   1. data embedded in the page,
 *   2. the geo-based GraphQL query,
 *   3. scraping the visible location cards.
 *
 * When no session is supplied, one is created for the city's own state and
 * city slug and is always closed before returning. A caller-supplied session
 * is used as-is and left open.
 *
 * @param city - City to fetch locations for.
 * @param options.session - Existing session to reuse.
 * @param options.verbose - Enable extra logging in the strategies (default false).
 * @returns The locations found, or an empty array when every strategy came up empty.
 */
export async function fetchLocationsForCity(
  city: DiscoveryCity,
  options: {
    session?: SessionCredentials;
    verbose?: boolean;
  } = {}
): Promise<DutchieLocationResponse[]> {
  const { verbose = false } = options;
  const ownsSession = !options.session;
  const session =
    options.session ?? (await createSession(city.citySlug, city.stateCode || undefined));

  try {
    console.log(`[LocationDiscovery] Fetching locations for ${city.cityName}, ${city.stateCode}...`);

    // Approach 1: Extract from page __NEXT_DATA__ or similar
    const locations = await extractLocationsFromPage(session.page, verbose);
    if (locations.length > 0) {
      console.log(`[LocationDiscovery] Found ${locations.length} locations from page data`);
      return locations;
    }

    // Approach 2: Try the geo-based GraphQL query
    const geoLocations = await fetchLocationsViaGraphQL(session, city, verbose);
    if (geoLocations.length > 0) {
      console.log(`[LocationDiscovery] Found ${geoLocations.length} locations from GraphQL`);
      return geoLocations;
    }

    // Approach 3: Scrape visible location cards
    const scrapedLocations = await scrapeLocationCards(session.page, verbose);
    if (scrapedLocations.length > 0) {
      console.log(`[LocationDiscovery] Found ${scrapedLocations.length} locations from scraping`);
      return scrapedLocations;
    }

    console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
    return [];
  } finally {
    // Only close what this call opened.
    if (ownsSession) {
      await closeSession(session);
    }
  }
}
/**
 * Read dispensary records embedded in the currently loaded page.
 *
 * Inside the page, two sources are checked in order:
 *   1. the JSON in the `#__NEXT_DATA__` element, under `props.pageProps` as
 *      `dispensaries`, `initialDispensaries` or `data.dispensaries`;
 *   2. `window.__APOLLO_STATE__` entries whose key starts with `Dispensary:`.
 *
 * Every record found is passed through `normalizeLocationResponse`.
 *
 * @param page - Page to inspect.
 * @param verbose - Log the source used, or the reason for a failure.
 * @returns Normalised locations; an empty array when nothing is embedded or
 *          when evaluation fails.
 */
async function extractLocationsFromPage(
  page: Page,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  try {
    // The callback runs inside the browser, so it must stay self-contained.
    const data = await page.evaluate(() => {
      const nextDataText = document.querySelector('#__NEXT_DATA__')?.textContent;
      if (nextDataText) {
        try {
          const pageProps = JSON.parse(nextDataText)?.props?.pageProps;
          const embedded =
            pageProps?.dispensaries ||
            pageProps?.initialDispensaries ||
            pageProps?.data?.dispensaries ||
            [];
          if (Array.isArray(embedded) && embedded.length > 0) {
            return { source: '__NEXT_DATA__', dispensaries: embedded };
          }
        } catch {
          // Unparseable JSON: fall through to the next source.
        }
      }

      const apolloState = (window as any).__APOLLO_STATE__;
      if (apolloState) {
        const cached = Object.keys(apolloState)
          .filter((key) => key.startsWith('Dispensary:'))
          .map((key) => apolloState[key]);
        if (cached.length > 0) {
          return { source: 'APOLLO_STATE', dispensaries: cached };
        }
      }

      return { source: 'none', dispensaries: [] };
    });

    if (verbose) {
      console.log(`[LocationDiscovery] Page data source: ${data.source}, count: ${data.dispensaries.length}`);
    }
    return data.dispensaries.map((d: any) => normalizeLocationResponse(d));
  } catch (error: any) {
    if (verbose) {
      console.log(`[LocationDiscovery] Could not extract from page data: ${error.message}`);
    }
    return [];
  }
}
/**
 * Query Dutchie's GraphQL endpoint for dispensaries around a city.
 *
 * The search is centred on a hard-coded coordinate for a handful of Arizona
 * city slugs. Any other slug falls back to the Phoenix coordinates. The
 * request is a persisted query (`ConsumerDispensaries`) sent with the
 * session's cookies and user agent, with a 50-mile search distance.
 *
 * @param session - Supplies the `cookie` and `user-agent` request headers.
 * @param city - City whose slug, name and state code shape the request.
 * @param verbose - Log non-200 statuses and request errors.
 * @returns Normalised locations; an empty array on a non-200 status, a
 *          request error or an empty result.
 */
async function fetchLocationsViaGraphQL(
  session: SessionCredentials,
  city: DiscoveryCity,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  const PHOENIX_CENTER = { lat: 33.4484, lng: -112.074 };
  const knownCenters: Record<string, { lat: number; lng: number }> = {
    'phoenix': { lat: 33.4484, lng: -112.074 },
    'tucson': { lat: 32.2226, lng: -110.9747 },
    'scottsdale': { lat: 33.4942, lng: -111.9261 },
    'mesa': { lat: 33.4152, lng: -111.8315 },
    'tempe': { lat: 33.4255, lng: -111.94 },
    'flagstaff': { lat: 35.1983, lng: -111.6513 },
    // Add more as needed
  };
  const center = knownCenters[city.citySlug] || PHOENIX_CENTER;

  const requestBody = {
    operationName: 'ConsumerDispensaries',
    variables: {
      dispensariesFilter: {
        latitude: center.lat,
        longitude: center.lng,
        distance: 50, // miles
        state: city.stateCode,
        city: city.cityName,
      },
    },
    extensions: {
      persistedQuery: {
        version: 1,
        sha256Hash: '0a5bfa6ca1d64ae47bcccb7c8077c87147cbc4e6982c17ceec97a2a4948b311b',
      },
    },
  };

  const requestConfig = {
    headers: {
      'content-type': 'application/json',
      'origin': 'https://dutchie.com',
      'referer': `https://dutchie.com/dispensaries/${city.stateCode?.toLowerCase()}/${city.citySlug}`,
      'user-agent': session.userAgent,
      'cookie': session.cookies,
    },
    timeout: 30000,
    // Never reject on HTTP status; the status is inspected below.
    validateStatus: () => true,
  };

  try {
    const response = await axios.post('https://dutchie.com/api-3/graphql', requestBody, requestConfig);

    if (response.status === 200) {
      const dispensaries = response.data?.data?.consumerDispensaries || [];
      return dispensaries.map((d: any) => normalizeLocationResponse(d));
    }

    if (verbose) {
      console.log(`[LocationDiscovery] GraphQL returned ${response.status}`);
    }
    return [];
  } catch (error: any) {
    if (verbose) {
      console.log(`[LocationDiscovery] GraphQL error: ${error.message}`);
    }
    return [];
  }
}
/**
 * Scrape dispensary cards from the rendered page.
 *
 * A list of candidate selectors is tried in order. For each matching element
 * the link, name and address are read, and the slug is taken from the
 * `/dispensary/{slug}` part of the link. An element without both a slug and a
 * name is ignored. The first selector that yields at least one usable card
 * wins. A selector that matches elements but produces no card does not stop
 * the search, so the later selectors still get their chance.
 *
 * @param page - Page to scrape.
 * @param verbose - Log the reason when scraping fails.
 * @returns One location per card, with an empty `id`; an empty array when no
 *          selector yields a card or when evaluation fails.
 */
async function scrapeLocationCards(
  page: Page,
  verbose: boolean
): Promise<DutchieLocationResponse[]> {
  try {
    // The callback runs inside the browser, so it must stay self-contained.
    const locations = await page.evaluate(() => {
      // Look for common dispensary card patterns
      const selectors = [
        '[data-testid="dispensary-card"]',
        '.dispensary-card',
        'a[href*="/dispensary/"]',
        '[class*="DispensaryCard"]',
      ];

      for (const selector of selectors) {
        const cards: any[] = [];
        document.querySelectorAll(selector).forEach((el) => {
          // The element may contain the link or be the link itself.
          const link = el.querySelector('a')?.href || (el as HTMLAnchorElement).href || '';
          const name = el.querySelector('h2, h3, [class*="name"]')?.textContent?.trim() || '';
          const address = el.querySelector('[class*="address"], address')?.textContent?.trim() || '';

          // Extract slug from URL
          const slugMatch = link.match(/\/dispensary\/([^/?]+)/);
          const slug = slugMatch ? slugMatch[1] : '';

          if (slug && name) {
            cards.push({
              slug,
              name,
              address,
              menuUrl: link,
            });
          }
        });

        // Stop only once a selector has actually produced cards.
        if (cards.length > 0) {
          return cards;
        }
      }

      return [] as any[];
    });

    return locations.map((d: any) => ({
      id: '',
      name: d.name,
      slug: d.slug,
      address: d.address,
      menuUrl: d.menuUrl,
    }));
  } catch (error: any) {
    if (verbose) {
      console.log(`[LocationDiscovery] Scraping error: ${error.message}`);
    }
    return [];
  }
}
/**
 * Normalize a raw dispensary record to a consistent shape.
 *
 * The raw record's own fields are copied first and the normalised fields are
 * written on top. A raw field that is present but empty (`''`, `null`, ...)
 * therefore no longer clobbers the value resolved from its alternatives.
 * Fields unknown to the normaliser are preserved untouched.
 *
 * Field resolution:
 *   - text fields take the first truthy alternative, else `''`
 *     (`country` defaults to `'US'`);
 *   - `menuUrl` falls back to `https://dutchie.com/dispensary/{slug}` when a
 *     slug is known;
 *   - coordinates take the first non-nullish alternative, so 0 is kept;
 *   - `offerPickup` defaults to true and `offerDelivery` to false;
 *   - `isRecreational` / `isMedical` fall back to whether `retailType`
 *     mentions them, and to true when there is no `retailType`.
 *
 * @param raw - Raw record from any source.
 * @returns The normalised record.
 */
function normalizeLocationResponse(raw: any): DutchieLocationResponse {
  const slug = raw.slug || raw.cName || raw.urlSlug || '';
  const id = raw.id || raw._id || raw.dispensaryId || '';
  return {
    // Preserve raw data first; everything below takes precedence over it.
    ...raw,
    id,
    name: raw.name || raw.dispensaryName || '',
    slug,
    address: raw.address || raw.fullAddress || '',
    address1: raw.address1 || raw.addressLine1 || raw.streetAddress || '',
    address2: raw.address2 || raw.addressLine2 || '',
    city: raw.city || '',
    state: raw.state || raw.stateCode || '',
    zip: raw.zip || raw.zipCode || raw.postalCode || '',
    country: raw.country || raw.countryCode || 'US',
    // `??` rather than `||`: a coordinate of 0 is a value, not a gap.
    latitude: raw.latitude ?? raw.lat ?? raw.location?.latitude,
    longitude: raw.longitude ?? raw.lng ?? raw.location?.longitude,
    timezone: raw.timezone || raw.tz || '',
    menuUrl: raw.menuUrl || (slug ? `https://dutchie.com/dispensary/${slug}` : ''),
    retailType: raw.retailType || raw.type || '',
    offerPickup: raw.offerPickup ?? raw.storeSettings?.offerPickup ?? true,
    offerDelivery: raw.offerDelivery ?? raw.storeSettings?.offerDelivery ?? false,
    isRecreational: raw.isRecreational ?? raw.retailType?.includes('Recreational') ?? true,
    isMedical: raw.isMedical ?? raw.retailType?.includes('Medical') ?? true,
  };
}
// ============================================================
// DATABASE OPERATIONS
// ============================================================
/**
 * Upsert a location into dutchie_discovery_locations.
 *
 * Rows are keyed by (platform, platform_location_id). The platform location id
 * is the location's `id`, falling back to its `slug`. On conflict the name,
 * menu URL and metadata are overwritten, the remaining listed columns are only
 * overwritten by non-null incoming values (COALESCE), and `last_seen_at` /
 * `updated_at` are refreshed.
 *
 * @param pool - Postgres pool used to run the statement.
 * @param location - Normalized location to store; also serialized whole into `metadata`.
 * @param cityId - Discovery city the location was found under, or null.
 * @returns The row id and whether the row was inserted (`isNew`) rather than updated.
 * @throws Error when the location has neither an id nor a slug: it would be
 *         stored under an empty conflict key and every such location would
 *         overwrite the same row.
 */
export async function upsertLocation(
  pool: Pool,
  location: DutchieLocationResponse,
  cityId: number | null
): Promise<{ id: number; isNew: boolean }> {
  const platformLocationId = location.id || location.slug;
  if (!platformLocationId) {
    throw new Error(
      `Cannot upsert location without an id or slug (name: ${location.name || 'unknown'})`
    );
  }
  const menuUrl = location.menuUrl || `https://dutchie.com/dispensary/${location.slug}`;

  const result = await pool.query(
    `INSERT INTO dutchie_discovery_locations (
platform,
platform_location_id,
platform_slug,
platform_menu_url,
name,
raw_address,
address_line1,
address_line2,
city,
state_code,
postal_code,
country_code,
latitude,
longitude,
timezone,
discovery_city_id,
metadata,
offers_delivery,
offers_pickup,
is_recreational,
is_medical,
last_seen_at,
updated_at
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, NOW(), NOW())
ON CONFLICT (platform, platform_location_id)
DO UPDATE SET
name = EXCLUDED.name,
platform_menu_url = EXCLUDED.platform_menu_url,
raw_address = COALESCE(EXCLUDED.raw_address, dutchie_discovery_locations.raw_address),
address_line1 = COALESCE(EXCLUDED.address_line1, dutchie_discovery_locations.address_line1),
city = COALESCE(EXCLUDED.city, dutchie_discovery_locations.city),
state_code = COALESCE(EXCLUDED.state_code, dutchie_discovery_locations.state_code),
postal_code = COALESCE(EXCLUDED.postal_code, dutchie_discovery_locations.postal_code),
latitude = COALESCE(EXCLUDED.latitude, dutchie_discovery_locations.latitude),
longitude = COALESCE(EXCLUDED.longitude, dutchie_discovery_locations.longitude),
timezone = COALESCE(EXCLUDED.timezone, dutchie_discovery_locations.timezone),
metadata = EXCLUDED.metadata,
offers_delivery = COALESCE(EXCLUDED.offers_delivery, dutchie_discovery_locations.offers_delivery),
offers_pickup = COALESCE(EXCLUDED.offers_pickup, dutchie_discovery_locations.offers_pickup),
is_recreational = COALESCE(EXCLUDED.is_recreational, dutchie_discovery_locations.is_recreational),
is_medical = COALESCE(EXCLUDED.is_medical, dutchie_discovery_locations.is_medical),
last_seen_at = NOW(),
updated_at = NOW()
RETURNING id, (xmax = 0) as is_new`,
    [
      PLATFORM,
      platformLocationId,
      location.slug,
      menuUrl,
      location.name,
      // Empty strings are stored as NULL so COALESCE keeps the previous value on update.
      location.address || null,
      location.address1 || null,
      location.address2 || null,
      location.city || null,
      location.state || null,
      location.zip || null,
      location.country || 'US',
      location.latitude || null,
      location.longitude || null,
      location.timezone || null,
      cityId,
      JSON.stringify(location),
      location.offerDelivery ?? null,
      location.offerPickup ?? null,
      location.isRecreational ?? null,
      location.isMedical ?? null,
    ]
  );

  // `xmax = 0` in the RETURNING clause is true only for freshly inserted rows.
  const [row] = result.rows;
  return {
    id: row.id,
    isNew: row.is_new,
  };
}
/**
 * List active discovery locations that are in a given status.
 *
 * Results are ordered newest-first by `first_seen_at` and paginated.
 *
 * @param pool - Postgres pool used to run the query.
 * @param status - Status the returned rows must have.
 * @param options - Optional `stateCode` / `countryCode` equality filters plus
 *                  `limit` (default 100) and `offset` (default 0).
 * @returns The matching rows mapped to `DiscoveryLocation`.
 */
export async function getLocationsByStatus(
  pool: Pool,
  status: DiscoveryStatus,
  options: {
    stateCode?: string;
    countryCode?: string;
    limit?: number;
    offset?: number;
  } = {}
): Promise<DiscoveryLocation[]> {
  const { stateCode, countryCode, limit = 100, offset = 0 } = options;

  const values: any[] = [status];
  // Registers a value and returns the placeholder ($n) that refers to it.
  const bind = (value: any): string => {
    values.push(value);
    return `$${values.length}`;
  };

  const fragments: string[] = [
    `
SELECT * FROM dutchie_discovery_locations
WHERE status = $1 AND active = TRUE
`,
  ];
  if (stateCode) {
    fragments.push(` AND state_code = ${bind(stateCode)}`);
  }
  if (countryCode) {
    fragments.push(` AND country_code = ${bind(countryCode)}`);
  }
  fragments.push(` ORDER BY first_seen_at DESC LIMIT ${bind(limit)} OFFSET ${bind(offset)}`);

  const { rows } = await pool.query<DiscoveryLocationRow>(fragments.join(''), values);
  return rows.map(mapLocationRowToLocation);
}
/**
 * Fetch a single discovery location by its primary key.
 *
 * @param pool - Postgres pool used to run the query.
 * @param id - Row id in dutchie_discovery_locations.
 * @returns The mapped location, or null when no row has that id.
 */
export async function getLocationById(
  pool: Pool,
  id: number
): Promise<DiscoveryLocation | null> {
  const { rows } = await pool.query<DiscoveryLocationRow>(
    `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
    [id]
  );
  const [match] = rows;
  return match ? mapLocationRowToLocation(match) : null;
}
/**
 * Set the status of a discovery location.
 *
 * `verified_at` is stamped with NOW() only when the new status is 'verified'
 * or 'merged'. The optional fields overwrite the stored value only when they
 * are provided (and truthy); otherwise the existing column value is kept.
 *
 * @param pool - Postgres pool used to run the update.
 * @param locationId - Row id in dutchie_discovery_locations.
 * @param status - New status to store.
 * @param options - Optional `dispensaryId`, `verifiedBy` and `notes` to record.
 */
export async function updateLocationStatus(
  pool: Pool,
  locationId: number,
  status: DiscoveryStatus,
  options: {
    dispensaryId?: number;
    verifiedBy?: string;
    notes?: string;
  } = {}
): Promise<void> {
  const statement = `UPDATE dutchie_discovery_locations
SET status = $2,
dispensary_id = COALESCE($3, dispensary_id),
verified_at = CASE WHEN $2 IN ('verified', 'merged') THEN NOW() ELSE verified_at END,
verified_by = COALESCE($4, verified_by),
notes = COALESCE($5, notes),
updated_at = NOW()
WHERE id = $1`;

  // Missing/falsy optionals become NULL so COALESCE leaves the column untouched.
  const values = [
    locationId,
    status,
    options.dispensaryId || null,
    options.verifiedBy || null,
    options.notes || null,
  ];

  await pool.query(statement, values);
}
/**
 * Case-insensitive substring search over active discovery locations.
 *
 * The term is matched (ILIKE, wrapped in `%`) against name, city, raw address
 * and platform slug. Results are ordered by name.
 *
 * @param pool - Postgres pool used to run the query.
 * @param query - Search term.
 * @param options - Optional `status` / `stateCode` equality filters and
 *                  `limit` (default 50).
 * @returns The matching rows mapped to `DiscoveryLocation`.
 */
export async function searchLocations(
  pool: Pool,
  query: string,
  options: {
    status?: DiscoveryStatus;
    stateCode?: string;
    limit?: number;
  } = {}
): Promise<DiscoveryLocation[]> {
  const { status, stateCode, limit = 50 } = options;

  const values: any[] = [`%${query}%`];
  // Registers a value and returns the placeholder ($n) that refers to it.
  const bind = (value: any): string => {
    values.push(value);
    return `$${values.length}`;
  };

  const pieces: string[] = [
    `
SELECT * FROM dutchie_discovery_locations
WHERE active = TRUE
AND (name ILIKE $1 OR city ILIKE $1 OR raw_address ILIKE $1 OR platform_slug ILIKE $1)
`,
  ];
  if (status) {
    pieces.push(` AND status = ${bind(status)}`);
  }
  if (stateCode) {
    pieces.push(` AND state_code = ${bind(stateCode)}`);
  }
  pieces.push(` ORDER BY name LIMIT ${bind(limit)}`);

  const { rows } = await pool.query<DiscoveryLocationRow>(pieces.join(''), values);
  return rows.map(mapLocationRowToLocation);
}
// ============================================================
// MAIN DISCOVERY FUNCTION
// ============================================================
/**
 * Discover and store the locations of one city.
 *
 * Fetches the city's locations, upserts each of them (failures are collected
 * per location rather than aborting the run) and, in live mode, records the
 * crawl time and location count on the city row. When nothing is fetched the
 * function returns early and the city row is left untouched.
 *
 * In dry-run mode nothing is written and every fetched location is counted as
 * "new".
 *
 * @param pool - Postgres pool used for all writes.
 * @param city - City to crawl.
 * @param options - `dryRun` skips all writes; `verbose` logs per-location detail.
 * @returns Counts of found / new / updated locations, collected error messages
 *          and the elapsed time in milliseconds.
 */
export async function discoverLocationsForCity(
  pool: Pool,
  city: DiscoveryCity,
  options: {
    dryRun?: boolean;
    verbose?: boolean;
  } = {}
): Promise<LocationDiscoveryResult> {
  const startedAt = Date.now();
  const { dryRun: isDryRun = false, verbose: isVerbose = false } = options;
  const failures: string[] = [];

  console.log(`[LocationDiscovery] Discovering locations for ${city.cityName}, ${city.stateCode}...`);
  console.log(`[LocationDiscovery] Mode: ${isDryRun ? 'DRY RUN' : 'LIVE'}`);

  const fetched = await fetchLocationsForCity(city, { verbose: isVerbose });

  if (fetched.length === 0) {
    console.log(`[LocationDiscovery] No locations found for ${city.cityName}`);
    return {
      cityId: city.id,
      citySlug: city.citySlug,
      locationsFound: 0,
      locationsUpserted: 0,
      locationsNew: 0,
      locationsUpdated: 0,
      errors: [],
      durationMs: Date.now() - startedAt,
    };
  }

  let created = 0;
  let refreshed = 0;

  for (const entry of fetched) {
    try {
      if (!isDryRun) {
        const outcome = await upsertLocation(pool, entry, city.id);
        if (outcome.isNew) {
          created += 1;
        } else {
          refreshed += 1;
        }
        if (isVerbose) {
          const action = outcome.isNew ? 'Created' : 'Updated';
          console.log(`[LocationDiscovery] ${action}: ${entry.name} -> ID ${outcome.id}`);
        }
      } else {
        // Dry run: nothing is written; the location is tallied as new.
        if (isVerbose) {
          console.log(`[LocationDiscovery][DryRun] Would upsert: ${entry.name} (${entry.slug})`);
        }
        created += 1;
      }
    } catch (error: any) {
      failures.push(`Location ${entry.slug}: ${error.message}`);
    }
  }

  // Update city crawl status
  if (!isDryRun) {
    await pool.query(
      `UPDATE dutchie_discovery_cities
SET last_crawled_at = NOW(),
location_count = $2,
updated_at = NOW()
WHERE id = $1`,
      [city.id, fetched.length]
    );
  }

  const elapsedMs = Date.now() - startedAt;
  console.log(`[LocationDiscovery] Complete for ${city.cityName}: ${created} new, ${refreshed} updated, ${failures.length} errors in ${elapsedMs}ms`);

  return {
    cityId: city.id,
    citySlug: city.citySlug,
    locationsFound: fetched.length,
    locationsUpserted: created + refreshed,
    locationsNew: created,
    locationsUpdated: refreshed,
    errors: failures,
    durationMs: elapsedMs,
  };
}

View File

@@ -0,0 +1,840 @@
/**
* Dutchie Discovery API Routes
*
* Express routes for the Dutchie store discovery pipeline.
* Provides endpoints for discovering, listing, and verifying locations.
*/
import { Router, Request, Response } from 'express';
import { Pool } from 'pg';
import {
runFullDiscovery,
discoverCity,
discoverState,
getDiscoveryStats,
} from './discovery-crawler';
import {
discoverCities,
getCitiesToCrawl,
getCityBySlug,
seedKnownCities,
ARIZONA_CITIES,
} from './city-discovery';
import {
DiscoveryLocation,
DiscoveryCity,
DiscoveryStatus,
mapLocationRowToLocation,
mapCityRowToCity,
} from './types';
export function createDiscoveryRoutes(pool: Pool): Router {
const router = Router();
// ============================================================
// DISCOVERY LOCATIONS
// ============================================================
/**
 * GET /api/discovery/locations
 * List discovered locations with filtering.
 *
 * Query parameters: `status`, `stateCode`, `countryCode` (equality),
 * `city`, `search` (case-insensitive substring; `search` matches name or
 * platform slug), `platform` (default 'dutchie'), `hasDispensary`
 * ('true' | 'false'), `limit` (default 50) and `offset` (default 0).
 *
 * Every filter column is qualified with the `dl` alias: the listing query joins
 * `dispensaries` and `dutchie_discovery_cities`, which share column names such
 * as `name`, `city`, `platform` and `state_code`, so unqualified references
 * would be ambiguous. Non-numeric `limit` / `offset` fall back to the defaults
 * instead of sending NaN to the database.
 *
 * Responds with `{ locations, total, limit, offset }`.
 */
router.get('/locations', async (req: Request, res: Response) => {
  try {
    const {
      status,
      stateCode,
      countryCode,
      city,
      platform = 'dutchie',
      search,
      hasDispensary,
      limit = '50',
      offset = '0',
    } = req.query;

    const parsedLimit = parseInt(String(limit), 10);
    const parsedOffset = parseInt(String(offset), 10);
    const pageLimit = Number.isFinite(parsedLimit) && parsedLimit > 0 ? parsedLimit : 50;
    const pageOffset = Number.isFinite(parsedOffset) && parsedOffset >= 0 ? parsedOffset : 0;

    let whereClause = 'WHERE dl.platform = $1 AND dl.active = TRUE';
    const filterParams: any[] = [platform];
    let paramIndex = 2;

    if (status) {
      whereClause += ` AND dl.status = $${paramIndex}`;
      filterParams.push(status);
      paramIndex++;
    }
    if (stateCode) {
      whereClause += ` AND dl.state_code = $${paramIndex}`;
      filterParams.push(stateCode);
      paramIndex++;
    }
    if (countryCode) {
      whereClause += ` AND dl.country_code = $${paramIndex}`;
      filterParams.push(countryCode);
      paramIndex++;
    }
    if (city) {
      whereClause += ` AND dl.city ILIKE $${paramIndex}`;
      filterParams.push(`%${city}%`);
      paramIndex++;
    }
    if (search) {
      whereClause += ` AND (dl.name ILIKE $${paramIndex} OR dl.platform_slug ILIKE $${paramIndex})`;
      filterParams.push(`%${search}%`);
      paramIndex++;
    }
    if (hasDispensary === 'true') {
      whereClause += ' AND dl.dispensary_id IS NOT NULL';
    } else if (hasDispensary === 'false') {
      whereClause += ' AND dl.dispensary_id IS NULL';
    }

    const { rows } = await pool.query(
      `
      SELECT
        dl.*,
        d.name as dispensary_name,
        dc.city_name as discovery_city_name
      FROM dutchie_discovery_locations dl
      LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
      LEFT JOIN dutchie_discovery_cities dc ON dl.discovery_city_id = dc.id
      ${whereClause}
      ORDER BY dl.first_seen_at DESC
      LIMIT $${paramIndex} OFFSET $${paramIndex + 1}
      `,
      [...filterParams, pageLimit, pageOffset]
    );

    // The total ignores pagination, so it uses only the filter parameters.
    const { rows: countRows } = await pool.query(
      `SELECT COUNT(*) as total FROM dutchie_discovery_locations dl ${whereClause}`,
      filterParams
    );

    const locations = rows.map((row: any) => ({
      ...mapLocationRowToLocation(row),
      dispensaryName: row.dispensary_name,
      discoveryCityName: row.discovery_city_name,
    }));

    res.json({
      locations,
      total: parseInt(countRows[0]?.total || '0', 10),
      limit: pageLimit,
      offset: pageOffset,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * GET /api/discovery/locations/pending
 * Get locations awaiting verification (status 'discovered', active).
 *
 * Registered BEFORE `/locations/:id`: routes match in registration order, so
 * with the parameterised route first a request for `/locations/pending` would
 * be handled as id "pending" and never reach this handler.
 *
 * Query parameters: `stateCode`, `countryCode` (equality) and `limit`
 * (default 100; non-numeric values fall back to the default).
 * Responds with `{ locations, total }`, where `total` is the number of rows returned.
 */
router.get('/locations/pending', async (req: Request, res: Response) => {
  try {
    const { stateCode, countryCode, limit = '100' } = req.query;

    const parsedLimit = parseInt(String(limit), 10);
    const pageLimit = Number.isFinite(parsedLimit) && parsedLimit > 0 ? parsedLimit : 100;

    let whereClause = `WHERE status = 'discovered' AND active = TRUE`;
    const params: any[] = [];
    let paramIndex = 1;

    if (stateCode) {
      whereClause += ` AND state_code = $${paramIndex}`;
      params.push(stateCode);
      paramIndex++;
    }
    if (countryCode) {
      whereClause += ` AND country_code = $${paramIndex}`;
      params.push(countryCode);
      paramIndex++;
    }
    params.push(pageLimit);

    const { rows } = await pool.query(
      `
      SELECT * FROM dutchie_discovery_locations
      ${whereClause}
      ORDER BY state_code, city, name
      LIMIT $${paramIndex}
      `,
      params
    );

    res.json({
      locations: rows.map(mapLocationRowToLocation),
      total: rows.length,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});

/**
 * GET /api/discovery/locations/:id
 * Get a single discovery location, with the linked dispensary's name and menu
 * URL and the discovery city's name when present.
 *
 * Responds 400 for a non-numeric id and 404 when no row has that id.
 */
router.get('/locations/:id', async (req: Request, res: Response) => {
  try {
    const locationId = parseInt(req.params.id, 10);
    if (Number.isNaN(locationId)) {
      return res.status(400).json({ error: 'Invalid location id' });
    }

    const { rows } = await pool.query(
      `
      SELECT
        dl.*,
        d.name as dispensary_name,
        d.menu_url as dispensary_menu_url,
        dc.city_name as discovery_city_name
      FROM dutchie_discovery_locations dl
      LEFT JOIN dispensaries d ON dl.dispensary_id = d.id
      LEFT JOIN dutchie_discovery_cities dc ON dl.discovery_city_id = dc.id
      WHERE dl.id = $1
      `,
      [locationId]
    );

    if (rows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }

    res.json({
      ...mapLocationRowToLocation(rows[0]),
      dispensaryName: rows[0].dispensary_name,
      dispensaryMenuUrl: rows[0].dispensary_menu_url,
      discoveryCityName: rows[0].discovery_city_name,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
// ============================================================
// DISCOVERY CITIES
// ============================================================
/**
 * GET /api/discovery/cities
 * Paginated list of discovery cities.
 *
 * Query parameters: `platform` (default 'dutchie'), `stateCode`,
 * `countryCode`, `crawlEnabled` ('true' | 'false'), `limit` (default 100) and
 * `offset` (default 0). Each city also carries `actualLocationCount`, counted
 * live from dutchie_discovery_locations.
 *
 * Responds with `{ cities, total, limit, offset }`.
 */
router.get('/cities', async (req: Request, res: Response) => {
  try {
    const {
      stateCode,
      countryCode,
      crawlEnabled,
      platform = 'dutchie',
      limit = '100',
      offset = '0',
    } = req.query;

    const filterValues: any[] = [platform];
    let where = 'WHERE platform = $1';

    // Appends an equality filter bound to the next positional parameter.
    const addEquals = (column: string, value: any): void => {
      filterValues.push(value);
      where += ` AND ${column} = $${filterValues.length}`;
    };

    if (stateCode) {
      addEquals('state_code', stateCode);
    }
    if (countryCode) {
      addEquals('country_code', countryCode);
    }
    switch (crawlEnabled) {
      case 'true':
        where += ' AND crawl_enabled = TRUE';
        break;
      case 'false':
        where += ' AND crawl_enabled = FALSE';
        break;
      default:
        break;
    }

    const pageSize = parseInt(limit as string, 10);
    const pageStart = parseInt(offset as string, 10);
    const limitSlot = filterValues.length + 1;

    const listing = await pool.query(
      `
SELECT
dc.*,
(SELECT COUNT(*) FROM dutchie_discovery_locations dl WHERE dl.discovery_city_id = dc.id) as actual_location_count
FROM dutchie_discovery_cities dc
${where}
ORDER BY dc.country_code, dc.state_code, dc.city_name
LIMIT $${limitSlot} OFFSET $${limitSlot + 1}
`,
      [...filterValues, pageSize, pageStart]
    );

    // Total ignores pagination: same filters, no LIMIT/OFFSET values.
    const counting = await pool.query(
      `SELECT COUNT(*) as total FROM dutchie_discovery_cities dc ${where}`,
      filterValues
    );

    const cities = listing.rows.map((cityRow: any) => ({
      ...mapCityRowToCity(cityRow),
      actualLocationCount: parseInt(cityRow.actual_location_count || '0', 10),
    }));

    res.json({
      cities,
      total: parseInt(counting.rows[0]?.total || '0', 10),
      limit: pageSize,
      offset: pageStart,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
// ============================================================
// STATISTICS
// ============================================================
/**
 * GET /api/discovery/stats
 * Return whatever `getDiscoveryStats` reports for the pool; 500 with the error
 * message on failure.
 */
router.get('/stats', async (_req: Request, res: Response) => {
  try {
    res.json(await getDiscoveryStats(pool));
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
// ============================================================
// VERIFICATION ACTIONS
// ============================================================
/**
 * POST /api/discovery/locations/:id/verify
 * Verify a discovered location and create a new canonical dispensary.
 *
 * Body: `{ verifiedBy?: string }` (default 'admin').
 *
 * The status check, the dispensary insert and the location update run in one
 * transaction, with the location row locked (`FOR UPDATE`). This keeps the two
 * writes atomic (no orphan dispensary when the second write fails) and
 * serialises concurrent verify requests so only one of them can create a
 * dispensary for the location.
 *
 * Responds 400 for a non-numeric id or a location that is not in status
 * 'discovered', 404 when the location does not exist.
 */
router.post('/locations/:id/verify', async (req: Request, res: Response) => {
  try {
    const locationId = parseInt(req.params.id, 10);
    if (Number.isNaN(locationId)) {
      return res.status(400).json({ error: 'Invalid location id' });
    }
    const { verifiedBy = 'admin' } = req.body;

    let refusal: { status: number; error: string } | null = null;
    let dispensaryId: number | null = null;

    const client = await pool.connect();
    try {
      await client.query('BEGIN');

      // Lock the row so a concurrent verify/link/reject waits for this one.
      const { rows: locRows } = await client.query(
        `SELECT * FROM dutchie_discovery_locations WHERE id = $1 FOR UPDATE`,
        [locationId]
      );
      const location = locRows[0];

      if (!location) {
        refusal = { status: 404, error: 'Location not found' };
      } else if (location.status !== 'discovered') {
        refusal = { status: 400, error: `Location already has status: ${location.status}` };
      } else {
        // Create the canonical dispensary
        const { rows: dispRows } = await client.query(
          `
          INSERT INTO dispensaries (
            name,
            slug,
            address,
            city,
            state,
            zip,
            latitude,
            longitude,
            timezone,
            menu_type,
            menu_url,
            platform_dispensary_id,
            active,
            created_at,
            updated_at
          ) VALUES (
            $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, TRUE, NOW(), NOW()
          )
          RETURNING id
          `,
          [
            location.name,
            location.platform_slug,
            location.address_line1,
            location.city,
            location.state_code,
            location.postal_code,
            location.latitude,
            location.longitude,
            location.timezone,
            location.platform,
            location.platform_menu_url,
            location.platform_location_id,
          ]
        );
        dispensaryId = dispRows[0].id;

        // Update the discovery location
        await client.query(
          `
          UPDATE dutchie_discovery_locations
          SET status = 'verified',
              dispensary_id = $1,
              verified_at = NOW(),
              verified_by = $2,
              updated_at = NOW()
          WHERE id = $3
          `,
          [dispensaryId, verifiedBy, locationId]
        );
      }

      await client.query(refusal ? 'ROLLBACK' : 'COMMIT');
    } catch (txError) {
      // Best-effort rollback; the original error is the one worth reporting.
      await client.query('ROLLBACK').catch(() => undefined);
      throw txError;
    } finally {
      client.release();
    }

    if (refusal) {
      return res.status(refusal.status).json({ error: refusal.error });
    }

    res.json({
      success: true,
      action: 'created',
      discoveryId: locationId,
      dispensaryId,
      message: `Created new dispensary (ID: ${dispensaryId})`,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/discovery/locations/:id/link
 * Link a discovered location to an existing dispensary.
 *
 * Body: `{ dispensaryId: number, verifiedBy?: string }` (verifiedBy defaults
 * to 'admin').
 *
 * The dispensary's platform id, menu URL and menu type are filled in only where
 * they are currently NULL; the location is then marked 'merged'. Both writes
 * and the status check run in one transaction with the location row locked
 * (`FOR UPDATE`), so a failure cannot leave the dispensary updated while the
 * location is still 'discovered', and concurrent requests cannot both pass the
 * status check.
 *
 * Responds 400 for a missing dispensaryId, a non-numeric id or a location that
 * is not in status 'discovered'; 404 when the dispensary or location is missing.
 */
router.post('/locations/:id/link', async (req: Request, res: Response) => {
  try {
    const { dispensaryId, verifiedBy = 'admin' } = req.body;
    if (!dispensaryId) {
      return res.status(400).json({ error: 'dispensaryId is required' });
    }
    const locationId = parseInt(req.params.id, 10);
    if (Number.isNaN(locationId)) {
      return res.status(400).json({ error: 'Invalid location id' });
    }

    let refusal: { status: number; error: string } | null = null;
    let dispensaryName: string | null = null;

    const client = await pool.connect();
    try {
      await client.query('BEGIN');

      // Verify dispensary exists
      const { rows: dispRows } = await client.query(
        `SELECT id, name FROM dispensaries WHERE id = $1`,
        [dispensaryId]
      );

      if (dispRows.length === 0) {
        refusal = { status: 404, error: 'Dispensary not found' };
      } else {
        dispensaryName = dispRows[0].name;

        // Lock the row so a concurrent verify/link/reject waits for this one.
        const { rows: locRows } = await client.query(
          `SELECT * FROM dutchie_discovery_locations WHERE id = $1 FOR UPDATE`,
          [locationId]
        );
        const location = locRows[0];

        if (!location) {
          refusal = { status: 404, error: 'Location not found' };
        } else if (location.status !== 'discovered') {
          refusal = { status: 400, error: `Location already has status: ${location.status}` };
        } else {
          // Update dispensary with platform info if missing
          await client.query(
            `
            UPDATE dispensaries
            SET platform_dispensary_id = COALESCE(platform_dispensary_id, $1),
                menu_url = COALESCE(menu_url, $2),
                menu_type = COALESCE(menu_type, $3),
                updated_at = NOW()
            WHERE id = $4
            `,
            [
              location.platform_location_id,
              location.platform_menu_url,
              location.platform,
              dispensaryId,
            ]
          );

          // Update the discovery location
          await client.query(
            `
            UPDATE dutchie_discovery_locations
            SET status = 'merged',
                dispensary_id = $1,
                verified_at = NOW(),
                verified_by = $2,
                updated_at = NOW()
            WHERE id = $3
            `,
            [dispensaryId, verifiedBy, locationId]
          );
        }
      }

      await client.query(refusal ? 'ROLLBACK' : 'COMMIT');
    } catch (txError) {
      // Best-effort rollback; the original error is the one worth reporting.
      await client.query('ROLLBACK').catch(() => undefined);
      throw txError;
    } finally {
      client.release();
    }

    if (refusal) {
      return res.status(refusal.status).json({ error: refusal.error });
    }

    res.json({
      success: true,
      action: 'linked',
      discoveryId: locationId,
      dispensaryId,
      dispensaryName,
      message: `Linked to existing dispensary: ${dispensaryName}`,
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/discovery/locations/:id/reject
 * Mark a location that is still 'discovered' as 'rejected'.
 *
 * Body: `{ reason?: string, verifiedBy?: string }`. The reason is stored in
 * `notes` (default 'Rejected by admin'); `verifiedBy` defaults to 'admin'.
 * Responds 404 when the location is missing and 400 when it is in any other
 * status.
 */
router.post('/locations/:id/reject', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;
    const { reason, verifiedBy = 'admin' } = req.body;

    const lookup = await pool.query(
      `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
      [parseInt(id, 10)]
    );
    if (lookup.rows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }

    const currentStatus = lookup.rows[0].status;
    if (currentStatus !== 'discovered') {
      return res.status(400).json({
        error: `Location already has status: ${currentStatus}`,
      });
    }

    const rejectSql = `
UPDATE dutchie_discovery_locations
SET status = 'rejected',
verified_at = NOW(),
verified_by = $1,
notes = $2,
updated_at = NOW()
WHERE id = $3
`;
    await pool.query(rejectSql, [verifiedBy, reason || 'Rejected by admin', id]);

    res.json({
      success: true,
      action: 'rejected',
      discoveryId: parseInt(id, 10),
      message: 'Location rejected',
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/discovery/locations/:id/unreject
 * Put a 'rejected' location back into 'discovered' and clear its
 * verified_at / verified_by stamps.
 *
 * Responds 404 when the location is missing and 400 when it is not currently
 * rejected.
 */
router.post('/locations/:id/unreject', async (req: Request, res: Response) => {
  try {
    const { id } = req.params;

    const lookup = await pool.query(
      `SELECT status FROM dutchie_discovery_locations WHERE id = $1`,
      [parseInt(id, 10)]
    );
    if (lookup.rows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }

    const currentStatus = lookup.rows[0].status;
    if (currentStatus !== 'rejected') {
      return res.status(400).json({
        error: `Location is not rejected. Current status: ${currentStatus}`,
      });
    }

    const restoreSql = `
UPDATE dutchie_discovery_locations
SET status = 'discovered',
verified_at = NULL,
verified_by = NULL,
updated_at = NOW()
WHERE id = $1
`;
    await pool.query(restoreSql, [id]);

    res.json({
      success: true,
      action: 'unrejected',
      discoveryId: parseInt(id, 10),
      message: 'Location restored to discovered status',
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
// ============================================================
// DISCOVERY ADMIN ACTIONS
// ============================================================
/**
 * POST /api/discovery/admin/discover-state
 * Run discovery for a whole state and return its result once it finishes.
 *
 * Body: `{ stateCode: string, dryRun?: boolean, cityLimit?: number }`
 * (`dryRun` defaults to false, `cityLimit` to 100). Always runs verbose.
 * Responds 400 when `stateCode` is missing.
 */
router.post('/admin/discover-state', async (req: Request, res: Response) => {
  try {
    const { stateCode, dryRun = false, cityLimit = 100 } = req.body;
    if (!stateCode) {
      return res.status(400).json({ error: 'stateCode is required' });
    }

    console.log(`[Discovery API] Starting state discovery for ${stateCode}`);

    const runOptions = { dryRun, cityLimit, verbose: true };
    const outcome = await discoverState(pool, stateCode, runOptions);

    res.json({ success: true, stateCode, result: outcome });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/discovery/admin/discover-city
 * Run discovery for one city and return its result once it finishes.
 *
 * Body: `{ citySlug: string, stateCode?: string, countryCode?: string,
 * dryRun?: boolean }` (`countryCode` defaults to 'US', `dryRun` to false).
 * Always runs verbose. Responds 400 when `citySlug` is missing and 404 when
 * `discoverCity` returns a falsy result.
 */
router.post('/admin/discover-city', async (req: Request, res: Response) => {
  try {
    const { citySlug, stateCode, countryCode = 'US', dryRun = false } = req.body;
    if (!citySlug) {
      return res.status(400).json({ error: 'citySlug is required' });
    }

    console.log(`[Discovery API] Starting city discovery for ${citySlug}`);

    const runOptions = { stateCode, countryCode, dryRun, verbose: true };
    const outcome = await discoverCity(pool, citySlug, runOptions);
    if (!outcome) {
      return res.status(404).json({ error: `City not found: ${citySlug}` });
    }

    res.json({ success: true, citySlug, result: outcome });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/discovery/admin/run-full
 * Run the full discovery pipeline and return its result once it finishes.
 *
 * Body (all optional): `stateCode`, `countryCode` ('US'), `cityLimit` (50),
 * `skipCityDiscovery` (false), `onlyStale` (true), `staleDays` (7),
 * `dryRun` (false). Always runs verbose.
 */
router.post('/admin/run-full', async (req: Request, res: Response) => {
  try {
    const {
      stateCode,
      countryCode = 'US',
      cityLimit = 50,
      skipCityDiscovery = false,
      onlyStale = true,
      staleDays = 7,
      dryRun = false,
    } = req.body;

    console.log(`[Discovery API] Starting full discovery`);

    const runOptions = {
      stateCode,
      countryCode,
      cityLimit,
      skipCityDiscovery,
      onlyStale,
      staleDays,
      dryRun,
      verbose: true,
    };
    const outcome = await runFullDiscovery(pool, runOptions);

    res.json({ success: true, result: outcome });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * POST /api/discovery/admin/seed-cities
 * Seed the predefined city list for a state.
 *
 * Body: `{ stateCode: string }`. Only 'AZ' has a predefined list
 * (`ARIZONA_CITIES`); any other state is answered with 400. The fields of the
 * `seedKnownCities` result are merged into the response.
 */
router.post('/admin/seed-cities', async (req: Request, res: Response) => {
  try {
    const { stateCode } = req.body;
    if (!stateCode) {
      return res.status(400).json({ error: 'stateCode is required' });
    }
    if (stateCode !== 'AZ') {
      return res.status(400).json({
        error: `No predefined cities for state: ${stateCode}. Add cities to city-discovery.ts`,
      });
    }

    const knownCities: any[] = ARIZONA_CITIES;
    const seeded = await seedKnownCities(pool, knownCities);

    res.json({ success: true, stateCode, ...seeded });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
/**
 * GET /api/discovery/admin/match-candidates/:id
 * Find potential dispensary matches for a discovery location.
 *
 * Candidates are dispensaries in the same state that match on the full name
 * (ILIKE), on the first word of the name, on the city, or that lie within
 * 5 miles when both sides have coordinates. At most 10 are returned, name
 * matches first, then by distance.
 *
 * Notes on the distance maths:
 *  - The spherical-law-of-cosines argument is clamped to [-1, 1] before
 *    `acos`; rounding can push it marginally past 1 for (near-)identical
 *    coordinates, which makes Postgres raise "input is out of range" — exactly
 *    the case of a perfect match.
 *  - A distance of 0 is a valid result and is reported as 0, not null.
 *
 * Responds 400 for a non-numeric id and 404 when the location does not exist.
 */
router.get('/admin/match-candidates/:id', async (req: Request, res: Response) => {
  try {
    const locationId = parseInt(req.params.id, 10);
    if (Number.isNaN(locationId)) {
      return res.status(400).json({ error: 'Invalid location id' });
    }

    // Get the discovery location
    const { rows: locRows } = await pool.query(
      `SELECT * FROM dutchie_discovery_locations WHERE id = $1`,
      [locationId]
    );
    if (locRows.length === 0) {
      return res.status(404).json({ error: 'Location not found' });
    }
    const location = locRows[0];

    // First word of the name drives the partial match. Trim first so a leading
    // space cannot produce the match-everything pattern '%%'; with no usable
    // word, fall back to the exact-name pattern.
    const fullName: string = location.name ?? '';
    const firstWord = fullName.trim().split(/\s+/)[0];
    const partialNamePattern = firstWord ? `%${firstWord}%` : fullName;

    // Static SQL fragment (no user input): great-circle distance in miles
    // between the location ($5 = lat, $6 = lng) and dispensary d.
    const distanceMilesSql = `(3959 * acos(LEAST(1.0::float, GREATEST(-1.0::float,
              cos(radians($5::float)) * cos(radians(d.latitude)) *
              cos(radians(d.longitude) - radians($6::float)) +
              sin(radians($5::float)) * sin(radians(d.latitude))
            ))))`;

    // Find potential matches by name similarity and location
    const { rows: candidates } = await pool.query(
      `
      SELECT
        d.id,
        d.name,
        d.city,
        d.state,
        d.address,
        d.menu_type,
        d.platform_dispensary_id,
        d.menu_url,
        d.latitude,
        d.longitude,
        CASE
          WHEN d.name ILIKE $1 THEN 'exact_name'
          WHEN d.name ILIKE $2 THEN 'partial_name'
          WHEN d.city ILIKE $3 AND d.state = $4 THEN 'same_city'
          ELSE 'location_match'
        END as match_type,
        -- Distance in miles if coordinates available
        CASE
          WHEN d.latitude IS NOT NULL AND d.longitude IS NOT NULL
            AND $5::float IS NOT NULL AND $6::float IS NOT NULL
          THEN ${distanceMilesSql}
          ELSE NULL
        END as distance_miles
      FROM dispensaries d
      WHERE d.state = $4
        AND (
          d.name ILIKE $1
          OR d.name ILIKE $2
          OR d.city ILIKE $3
          OR (
            d.latitude IS NOT NULL
            AND d.longitude IS NOT NULL
            AND $5::float IS NOT NULL
            AND $6::float IS NOT NULL
            AND ${distanceMilesSql} < 5
          )
        )
      ORDER BY
        CASE
          WHEN d.name ILIKE $1 THEN 1
          WHEN d.name ILIKE $2 THEN 2
          ELSE 3
        END,
        distance_miles NULLS LAST
      LIMIT 10
      `,
      [
        fullName,
        partialNamePattern,
        location.city,
        location.state_code,
        location.latitude,
        location.longitude,
      ]
    );

    res.json({
      location: mapLocationRowToLocation(location),
      candidates: candidates.map((c: any) => ({
        id: c.id,
        name: c.name,
        city: c.city,
        state: c.state,
        address: c.address,
        menuType: c.menu_type,
        platformDispensaryId: c.platform_dispensary_id,
        menuUrl: c.menu_url,
        matchType: c.match_type,
        // Rounded to one decimal; explicit null check so 0 miles stays 0.
        distanceMiles:
          c.distance_miles != null ? Math.round(Number(c.distance_miles) * 10) / 10 : null,
      })),
    });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  }
});
return router;
}
export default createDiscoveryRoutes;

View File

@@ -0,0 +1,269 @@
/**
* Dutchie Discovery Types
*
* Type definitions for the Dutchie store discovery pipeline.
*/
// ============================================================
// DISCOVERY CITY
// ============================================================
/**
 * A discovery city in camelCase application form.
 * Built from a `DiscoveryCityRow` by `mapCityRowToCity`, field for field.
 */
export interface DiscoveryCity {
id: number;
platform: string;
cityName: string;
citySlug: string;
stateCode: string | null;
countryCode: string;
// Stamped with NOW() after a live (non-dry-run) location crawl of the city.
lastCrawledAt: Date | null;
crawlEnabled: boolean;
// Number of locations fetched by the last live crawl of the city.
locationCount: number | null;
notes: string | null;
metadata: Record<string, any> | null;
createdAt: Date;
updatedAt: Date;
}
/**
 * Snake_case row shape consumed by `mapCityRowToCity`; the routes feed it rows
 * selected from `dutchie_discovery_cities`. Mirrors `DiscoveryCity` one-to-one.
 */
export interface DiscoveryCityRow {
id: number;
platform: string;
city_name: string;
city_slug: string;
state_code: string | null;
country_code: string;
last_crawled_at: Date | null;
crawl_enabled: boolean;
location_count: number | null;
notes: string | null;
metadata: Record<string, any> | null;
created_at: Date;
updated_at: Date;
}
// ============================================================
// DISCOVERY LOCATION
// ============================================================
/**
 * Lifecycle status of a discovery location.
 *
 * - 'discovered': awaiting review; the only status from which the verify,
 *   link and reject routes will act.
 * - 'verified': set by the verify route after creating a new dispensary.
 * - 'merged': set by the link route after attaching to an existing dispensary.
 * - 'rejected': set by the reject route; the unreject route returns it to
 *   'discovered'.
 */
export type DiscoveryStatus = 'discovered' | 'verified' | 'rejected' | 'merged';
/**
 * A discovered store location in camelCase application form.
 * Built from a `DiscoveryLocationRow` by `mapLocationRowToLocation`, field for
 * field.
 */
export interface DiscoveryLocation {
id: number;
platform: string;
// Platform-side id; the upsert falls back to the slug when the platform gave no id.
platformLocationId: string;
platformSlug: string;
platformMenuUrl: string;
name: string;
rawAddress: string | null;
addressLine1: string | null;
addressLine2: string | null;
city: string | null;
stateCode: string | null;
postalCode: string | null;
countryCode: string | null;
latitude: number | null;
longitude: number | null;
timezone: string | null;
status: DiscoveryStatus;
// Linked row in `dispensaries`; set by the verify and link routes.
dispensaryId: number | null;
// City the location was discovered under (dutchie_discovery_cities.id).
discoveryCityId: number | null;
// The location payload as it was passed to the upsert, stored as JSON.
metadata: Record<string, any> | null;
// The reject route stores the rejection reason here.
notes: string | null;
offersDelivery: boolean | null;
offersPickup: boolean | null;
isRecreational: boolean | null;
isMedical: boolean | null;
firstSeenAt: Date;
// Refreshed to NOW() on every upsert of the location.
lastSeenAt: Date;
lastCheckedAt: Date | null;
// Stamped by verify/link/reject; cleared (with verifiedBy) by unreject.
verifiedAt: Date | null;
verifiedBy: string | null;
active: boolean;
createdAt: Date;
updatedAt: Date;
}
/**
 * Snake_case row shape consumed by `mapLocationRowToLocation`; callers feed it
 * rows selected from `dutchie_discovery_locations`. Mirrors `DiscoveryLocation`
 * one-to-one.
 */
export interface DiscoveryLocationRow {
id: number;
platform: string;
platform_location_id: string;
platform_slug: string;
platform_menu_url: string;
name: string;
raw_address: string | null;
address_line1: string | null;
address_line2: string | null;
city: string | null;
state_code: string | null;
postal_code: string | null;
country_code: string | null;
latitude: number | null;
longitude: number | null;
timezone: string | null;
status: DiscoveryStatus;
dispensary_id: number | null;
discovery_city_id: number | null;
metadata: Record<string, any> | null;
notes: string | null;
offers_delivery: boolean | null;
offers_pickup: boolean | null;
is_recreational: boolean | null;
is_medical: boolean | null;
first_seen_at: Date;
last_seen_at: Date;
last_checked_at: Date | null;
verified_at: Date | null;
verified_by: string | null;
active: boolean;
created_at: Date;
updated_at: Date;
}
// ============================================================
// RAW API RESPONSES
// ============================================================
/**
 * A city as reported by the platform before it is stored.
 * Only `slug` and `name` are required; state and country may arrive either as
 * a name or as a code, hence the paired optional fields.
 */
export interface DutchieCityResponse {
slug: string;
name: string;
state?: string;
stateCode?: string;
country?: string;
countryCode?: string;
}
/**
 * A location as obtained from the platform (API payload or scraped card),
 * after normalization of the well-known fields.
 *
 * Only `id`, `name` and `slug` are required. `id` may be an empty string (the
 * card scraper does not know it); the upsert then keys the row by `slug`.
 */
export interface DutchieLocationResponse {
id: string;
name: string;
slug: string;
address?: string;
address1?: string;
address2?: string;
city?: string;
state?: string;
zip?: string;
zipCode?: string;
country?: string;
latitude?: number;
longitude?: number;
timezone?: string;
menuUrl?: string;
retailType?: string;
offerPickup?: boolean;
offerDelivery?: boolean;
isRecreational?: boolean;
isMedical?: boolean;
// Raw response preserved: any additional payload keys are carried along untyped.
[key: string]: any;
}
// ============================================================
// DISCOVERY RESULTS
// ============================================================
/**
 * Summary of one city-discovery run: counts of cities found, upserted and
 * skipped, the error messages collected, and the elapsed time in milliseconds.
 */
export interface CityDiscoveryResult {
citiesFound: number;
citiesUpserted: number;
citiesSkipped: number;
errors: string[];
durationMs: number;
}
/**
 * Summary of discovering the locations of a single city, as returned by
 * `discoverLocationsForCity`.
 */
export interface LocationDiscoveryResult {
cityId: number;
citySlug: string;
// Number of locations fetched for the city.
locationsFound: number;
// locationsNew + locationsUpdated.
locationsUpserted: number;
// Rows inserted; in a dry run every fetched location is counted here.
locationsNew: number;
// Rows that already existed and were updated.
locationsUpdated: number;
// One message per location whose upsert failed.
errors: string[];
durationMs: number;
}
/**
 * Combined result of a full pipeline run: the city-discovery summary, one
 * location summary per crawled city, and overall totals.
 * NOTE(review): presumably what `runFullDiscovery` returns — confirm in the crawler.
 */
export interface FullDiscoveryResult {
cities: CityDiscoveryResult;
locations: LocationDiscoveryResult[];
totalLocationsFound: number;
totalLocationsUpserted: number;
durationMs: number;
}
// ============================================================
// VERIFICATION
// ============================================================
/**
 * Outcome of a verification action on a discovery location.
 * `action` names what was done: a new dispensary 'created', an existing one
 * 'linked', or the location 'rejected'.
 */
export interface VerificationResult {
success: boolean;
discoveryId: number;
dispensaryId: number | null;
action: 'created' | 'linked' | 'rejected';
error?: string;
}
/**
 * Outcome of promoting a discovery location to a dispensary.
 * NOTE(review): `crawlProfileId` / `scheduleId` presumably identify a crawl
 * profile and schedule created during promotion — confirm against the
 * promotion code, which is not part of this file.
 */
export interface PromotionResult {
success: boolean;
discoveryId: number;
dispensaryId: number;
crawlProfileId?: number;
scheduleId?: number;
error?: string;
}
// ============================================================
// MAPPER FUNCTIONS
// ============================================================
/**
 * Convert a snake_case city row into its camelCase `DiscoveryCity` form.
 * Values are copied unchanged; columns not listed on `DiscoveryCity` are dropped.
 *
 * @param row - City row as selected from the database.
 * @returns The same data under camelCase keys.
 */
export function mapCityRowToCity(row: DiscoveryCityRow): DiscoveryCity {
  const {
    id,
    platform,
    city_name: cityName,
    city_slug: citySlug,
    state_code: stateCode,
    country_code: countryCode,
    last_crawled_at: lastCrawledAt,
    crawl_enabled: crawlEnabled,
    location_count: locationCount,
    notes,
    metadata,
    created_at: createdAt,
    updated_at: updatedAt,
  } = row;

  return {
    id,
    platform,
    cityName,
    citySlug,
    stateCode,
    countryCode,
    lastCrawledAt,
    crawlEnabled,
    locationCount,
    notes,
    metadata,
    createdAt,
    updatedAt,
  };
}
/**
 * Convert a snake_case location row into its camelCase `DiscoveryLocation` form.
 * Values are copied unchanged; extra columns on the row (for example joined
 * aliases) are dropped.
 *
 * @param row - Location row as selected from the database.
 * @returns The same data under camelCase keys.
 */
export function mapLocationRowToLocation(row: DiscoveryLocationRow): DiscoveryLocation {
  const {
    id,
    platform,
    platform_location_id: platformLocationId,
    platform_slug: platformSlug,
    platform_menu_url: platformMenuUrl,
    name,
    raw_address: rawAddress,
    address_line1: addressLine1,
    address_line2: addressLine2,
    city,
    state_code: stateCode,
    postal_code: postalCode,
    country_code: countryCode,
    latitude,
    longitude,
    timezone,
    status,
    dispensary_id: dispensaryId,
    discovery_city_id: discoveryCityId,
    metadata,
    notes,
    offers_delivery: offersDelivery,
    offers_pickup: offersPickup,
    is_recreational: isRecreational,
    is_medical: isMedical,
    first_seen_at: firstSeenAt,
    last_seen_at: lastSeenAt,
    last_checked_at: lastCheckedAt,
    verified_at: verifiedAt,
    verified_by: verifiedBy,
    active,
    created_at: createdAt,
    updated_at: updatedAt,
  } = row;

  return {
    id,
    platform,
    platformLocationId,
    platformSlug,
    platformMenuUrl,
    name,
    rawAddress,
    addressLine1,
    addressLine2,
    city,
    stateCode,
    postalCode,
    countryCode,
    latitude,
    longitude,
    timezone,
    status,
    dispensaryId,
    discoveryCityId,
    metadata,
    notes,
    offersDelivery,
    offersPickup,
    isRecreational,
    isMedical,
    firstSeenAt,
    lastSeenAt,
    lastCheckedAt,
    verifiedAt,
    verifiedBy,
    active,
    createdAt,
    updatedAt,
  };
}