feat: SEO template library, discovery pipeline, and orchestrator enhancements
## SEO Template Library
- Add complete template library with 7 page types (state, city, category, brand, product, search, regeneration)
- Add Template Library tab in SEO Orchestrator with accordion-based editors
- Add template preview, validation, and variable injection engine
- Add API endpoints: /api/seo/templates, preview, validate, generate, regenerate

## Discovery Pipeline
- Add promotion.ts for discovery location validation and promotion
- Add discover-all-states.ts script for multi-state discovery
- Add promotion log migration (067)
- Enhance discovery routes and types

## Orchestrator & Admin
- Add crawl_enabled filter to stores page
- Add API permissions page
- Add job queue management
- Add price analytics routes
- Add markets and intelligence routes
- Enhance dashboard and worker monitoring

## Infrastructure
- Add migrations for worker definitions, SEO settings, field alignment
- Add canonical pipeline for scraper v2
- Update hydration and sync orchestrator
- Enhance multi-state query service

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
385
backend/src/scripts/discover-all-states.ts
Normal file
385
backend/src/scripts/discover-all-states.ts
Normal file
@@ -0,0 +1,385 @@
|
||||
#!/usr/bin/env npx tsx
|
||||
/**
|
||||
* Discover All States - Sequential State-by-State Dutchie Discovery
|
||||
*
|
||||
* This script discovers Dutchie dispensaries for every state in the US_STATES
* list below (states with legal cannabis, plus DC), processing one state at a
* time with delays between states.
|
||||
*
|
||||
* Progress is automatically saved to /tmp/discovery-progress.json
|
||||
* so the script can resume from where it left off if interrupted.
|
||||
*
|
||||
* Usage:
|
||||
* DATABASE_URL="..." npx tsx src/scripts/discover-all-states.ts
|
||||
* DATABASE_URL="..." npx tsx src/scripts/discover-all-states.ts --dry-run
|
||||
* DATABASE_URL="..." npx tsx src/scripts/discover-all-states.ts --start-from CA
|
||||
* DATABASE_URL="..." npx tsx src/scripts/discover-all-states.ts --resume
|
||||
* DATABASE_URL="..." npx tsx src/scripts/discover-all-states.ts --reset # Clear progress, start fresh
|
||||
*
|
||||
* Options:
|
||||
* --dry-run Don't save to database, just show what would happen
|
||||
* --start-from Start from a specific state (skip earlier states)
|
||||
* --states Comma-separated list of specific states to run (e.g., AZ,CA,CO)
|
||||
* --verbose Show detailed output
|
||||
* --resume Auto-resume from last saved progress (default if progress file exists)
|
||||
* --reset Clear progress file and start fresh
|
||||
*/
|
||||
|
||||
import { Pool } from 'pg';
|
||||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
|
||||
/** Path of the JSON checkpoint file used to resume an interrupted run. */
const PROGRESS_FILE = '/tmp/discovery-progress.json';

/** Checkpoint persisted to PROGRESS_FILE after each successfully discovered state. */
interface ProgressData {
  /** Code of the most recently completed state, or null before any state has finished. */
  lastCompletedState: string | null;
  /** Index of that state within US_STATES; -1 before any state has finished. */
  lastCompletedIndex: number;
  /** ISO-8601 timestamp taken when this progress record was first created. */
  startedAt: string;
  /** ISO-8601 timestamp refreshed on every save. */
  updatedAt: string;
  /** Codes of all states that completed without throwing, in completion order. */
  completedStates: string[];
}
|
||||
|
||||
/** Runtime check that a parsed JSON value has the shape of a ProgressData checkpoint. */
function isProgressData(value: unknown): value is ProgressData {
  if (typeof value !== 'object' || value === null) {
    return false;
  }
  const candidate = value as Record<string, unknown>;
  return (
    (candidate.lastCompletedState === null || typeof candidate.lastCompletedState === 'string') &&
    typeof candidate.lastCompletedIndex === 'number' &&
    Number.isInteger(candidate.lastCompletedIndex) &&
    candidate.lastCompletedIndex >= -1 &&
    typeof candidate.startedAt === 'string' &&
    typeof candidate.updatedAt === 'string' &&
    Array.isArray(candidate.completedStates) &&
    candidate.completedStates.every((s: unknown) => typeof s === 'string')
  );
}

/**
 * Read the saved checkpoint from PROGRESS_FILE.
 *
 * @returns The saved progress, or null when the file is missing, unreadable,
 *          not valid JSON, or does not have the expected shape.
 */
function loadProgress(): ProgressData | null {
  try {
    if (!fs.existsSync(PROGRESS_FILE)) {
      return null;
    }
    const parsed: unknown = JSON.parse(fs.readFileSync(PROGRESS_FILE, 'utf-8'));
    if (isProgressData(parsed)) {
      return parsed;
    }
    // A structurally wrong file (e.g. `{}`) would otherwise yield a NaN resume
    // index downstream and make the script believe every state is already done.
    console.warn('[Progress] Ignoring progress file with unexpected contents:', PROGRESS_FILE);
  } catch (e) {
    console.warn('[Progress] Could not load progress file:', e);
  }
  return null;
}
|
||||
|
||||
/**
 * Persist the checkpoint to PROGRESS_FILE, refreshing `updatedAt` on the
 * passed-in object first. Failures are logged and otherwise ignored so that a
 * checkpointing problem never aborts discovery.
 *
 * @param progress Progress record to write; its `updatedAt` field is mutated.
 */
function saveProgress(progress: ProgressData): void {
  // Write to a sibling temp file and rename it over the checkpoint so that an
  // interruption mid-write cannot leave a truncated, unparseable progress file.
  const tempFile = `${PROGRESS_FILE}.tmp`;
  try {
    progress.updatedAt = new Date().toISOString();
    fs.writeFileSync(tempFile, JSON.stringify(progress, null, 2));
    fs.renameSync(tempFile, PROGRESS_FILE);
  } catch (e) {
    console.warn('[Progress] Could not save progress:', e);
  }
}
|
||||
|
||||
/**
 * Delete the checkpoint file if it exists. A missing file is not an error;
 * any other failure is logged and ignored.
 */
function clearProgress(): void {
  try {
    // Unlink directly instead of checking existence first, so there is no
    // window between the check and the delete.
    fs.unlinkSync(PROGRESS_FILE);
    console.log('[Progress] Cleared progress file');
  } catch (e) {
    const missing =
      typeof e === 'object' && e !== null && (e as { code?: unknown }).code === 'ENOENT';
    if (!missing) {
      console.warn('[Progress] Could not clear progress:', e);
    }
  }
}
|
||||
import { discoverState } from '../discovery';
|
||||
|
||||
// US states with legal cannabis (medical or recreational), plus DC.
// Ordered roughly by market size / likelihood of Dutchie presence.
// The position of a code in this list is what the resume checkpoint stores,
// so reordering it invalidates any existing progress file.
const US_STATES = [
  'AZ', // Arizona
  'CA', // California
  'CO', // Colorado
  'FL', // Florida
  'IL', // Illinois
  'MA', // Massachusetts
  'MI', // Michigan
  'NV', // Nevada
  'NJ', // New Jersey
  'NY', // New York
  'OH', // Ohio
  'OR', // Oregon
  'PA', // Pennsylvania
  'WA', // Washington
  'MD', // Maryland
  'MO', // Missouri
  'CT', // Connecticut
  'NM', // New Mexico
  'ME', // Maine
  'VT', // Vermont
  'MT', // Montana
  'AK', // Alaska
  'OK', // Oklahoma
  'AR', // Arkansas
  'ND', // North Dakota
  'SD', // South Dakota
  'MN', // Minnesota
  'NH', // New Hampshire
  'RI', // Rhode Island
  'DE', // Delaware
  'HI', // Hawaii
  'WV', // West Virginia
  'LA', // Louisiana
  'UT', // Utah
  'VA', // Virginia
  'DC', // District of Columbia
];
|
||||
|
||||
/** Per-state outcome collected by main() for the end-of-run summary. */
interface DiscoveryResult {
  /** Two-letter code of the state this row describes. */
  stateCode: string;
  /** Number of entries in the discovery result's `locations` array; 0 when the state failed. */
  citiesCrawled: number;
  /** Total locations reported found for the state; 0 when the state failed. */
  locationsFound: number;
  /** Total locations reported upserted for the state; 0 when the state failed. */
  locationsUpserted: number;
  /** Duration reported by the discovery call, in milliseconds; 0 when the state failed. */
  durationMs: number;
  /** Error messages gathered from the individual location results, or the thrown error's message. */
  errors: string[];
}
|
||||
|
||||
/**
 * Parse `--flag`, `--flag value` and `--flag=value` style arguments from
 * `process.argv`.
 *
 * - `--key=value` stores everything after the FIRST `=` as the value, so
 *   values may themselves contain `=`.
 * - `--key value` consumes the next argument as the value when that argument
 *   is non-empty and does not itself start with `--`.
 * - A bare `--key` is stored as `true`.
 * - Arguments that do not start with `--` (and were not consumed as a value)
 *   are ignored.
 *
 * @returns Map from flag name (without the leading dashes) to its value.
 */
function parseArgs(): Record<string, string | boolean> {
  const args = process.argv.slice(2);
  const flags: Record<string, string | boolean> = {};

  for (let i = 0; i < args.length; i++) {
    const arg = args[i];
    if (!arg.startsWith('--')) {
      continue;
    }

    const body = arg.slice(2);
    const eqIndex = body.indexOf('=');
    if (eqIndex !== -1) {
      // Split on the first '=' only; a plain split('=') would silently drop
      // everything after a second '=' in the value.
      flags[body.slice(0, eqIndex)] = body.slice(eqIndex + 1);
      continue;
    }

    const next = args[i + 1];
    if (next && !next.startsWith('--')) {
      flags[body] = next;
      i++; // the following argument was consumed as this flag's value
    } else {
      flags[body] = true;
    }
  }

  return flags;
}
|
||||
|
||||
/**
 * Script entry point: discover Dutchie locations state by state.
 *
 * Flow:
 *  1. Parse CLI flags and decide which states to process (explicit `--states`
 *     list, `--start-from`, saved checkpoint, or the full US_STATES list).
 *  2. Call `discoverState` for each state in sequence, pausing between states.
 *     A failing state is recorded and the run continues with the next one.
 *  3. Print a run summary followed by totals queried from the database.
 *
 * Checkpointing: the progress file is only written (and cleared on completion)
 * for live, sequential runs. Dry runs and `--states` runs leave it untouched,
 * because a dry run has not really processed anything and an ad-hoc state list
 * does not follow US_STATES order, so either would corrupt the resume pointer.
 *
 * Exits the process with code 1 on invalid flags or a missing DATABASE_URL,
 * and with code 0 when the checkpoint says every state is already done.
 */
async function main(): Promise<void> {
  const flags = parseArgs();
  const dryRun = Boolean(flags['dry-run']);
  const verbose = Boolean(flags.verbose);
  const reset = Boolean(flags.reset);
  // --resume is accepted for explicitness, but auto-resume is already the
  // default whenever a progress file exists, so it needs no handling here.

  // Value-taking flags parse to `true` when given without a value. Reject that
  // up front instead of crashing later on `.split()` / `.toUpperCase()`.
  const rawStartFrom = flags['start-from'];
  const rawStates = flags.states;
  if (rawStartFrom === true) {
    console.error('ERROR: --start-from requires a state code (e.g., --start-from CA)');
    process.exit(1);
  }
  if (rawStates === true) {
    console.error('ERROR: --states requires a comma-separated list (e.g., --states AZ,CA,CO)');
    process.exit(1);
  }

  let startFrom = typeof rawStartFrom === 'string' ? rawStartFrom : undefined;
  const specificStates =
    typeof rawStates === 'string' && rawStates
      ? rawStates.split(',').map((s) => s.trim().toUpperCase())
      : null;

  // Only live, sequential runs may touch the checkpoint (see function docs).
  const trackProgress = !dryRun && !specificStates;

  // Handle reset flag
  if (reset) {
    clearProgress();
  }

  // Determine which states to process
  let statesToProcess = specificStates || US_STATES;

  // Check for saved progress (auto-resume unless --reset or --start-from specified)
  const savedProgress = loadProgress();
  if (savedProgress && !reset && !startFrom && !specificStates) {
    const nextIndex = savedProgress.lastCompletedIndex + 1;
    if (nextIndex < US_STATES.length) {
      startFrom = US_STATES[nextIndex];
      console.log(`[Progress] Resuming from saved progress`);
      console.log(`[Progress] Last completed: ${savedProgress.lastCompletedState} (${savedProgress.completedStates.length} states done)`);
      console.log(`[Progress] Started at: ${savedProgress.startedAt}`);
      console.log(`[Progress] Last update: ${savedProgress.updatedAt}`);
      console.log('');
    } else {
      console.log(`[Progress] All states already completed! Use --reset to start over.`);
      process.exit(0);
    }
  }

  if (startFrom) {
    const startIndex = statesToProcess.indexOf(startFrom.toUpperCase());
    if (startIndex === -1) {
      console.error(`ERROR: State ${startFrom} not found in list`);
      process.exit(1);
    }
    statesToProcess = statesToProcess.slice(startIndex);
    console.log(`Starting from ${startFrom}, ${statesToProcess.length} states remaining`);
  }

  // Initialize progress tracking
  const progress: ProgressData = savedProgress || {
    lastCompletedState: null,
    lastCompletedIndex: -1,
    startedAt: new Date().toISOString(),
    updatedAt: new Date().toISOString(),
    completedStates: [],
  };

  console.log('='.repeat(70));
  console.log('DUTCHIE ALL-STATES DISCOVERY');
  console.log('='.repeat(70));
  console.log(`Mode: ${dryRun ? 'DRY RUN' : 'LIVE'}`);
  console.log(`States to process: ${statesToProcess.length}`);
  console.log(`States: ${statesToProcess.join(', ')}`);
  console.log('');

  // Create database pool
  const connectionString = process.env.DATABASE_URL;
  if (!connectionString) {
    console.error('ERROR: DATABASE_URL environment variable is required');
    process.exit(1);
  }
  const pool = new Pool({ connectionString });

  const results: DiscoveryResult[] = [];
  const startTime = Date.now();

  try {
    for (let i = 0; i < statesToProcess.length; i++) {
      const stateCode = statesToProcess[i];

      console.log('');
      console.log('─'.repeat(70));
      console.log(`[${i + 1}/${statesToProcess.length}] Discovering ${stateCode}...`);
      console.log('─'.repeat(70));

      try {
        const result = await discoverState(pool, stateCode, {
          dryRun,
          verbose,
          cityLimit: 200, // Allow up to 200 cities per state
        });

        const discoveryResult: DiscoveryResult = {
          stateCode,
          citiesCrawled: result.locations.length,
          locationsFound: result.totalLocationsFound,
          locationsUpserted: result.totalLocationsUpserted,
          durationMs: result.durationMs,
          errors: [],
        };

        // Collect errors from city results
        for (const loc of result.locations) {
          if (loc.errors && loc.errors.length > 0) {
            discoveryResult.errors.push(...loc.errors);
          }
        }

        results.push(discoveryResult);

        // Save progress after each successful state
        if (trackProgress) {
          progress.lastCompletedState = stateCode;
          progress.lastCompletedIndex = US_STATES.indexOf(stateCode);
          if (!progress.completedStates.includes(stateCode)) {
            progress.completedStates.push(stateCode);
          }
          saveProgress(progress);
        }

        console.log(`\n[${stateCode}] COMPLETE:`);
        console.log(` Cities crawled: ${discoveryResult.citiesCrawled}`);
        console.log(` Locations found: ${discoveryResult.locationsFound}`);
        console.log(` Locations upserted: ${discoveryResult.locationsUpserted}`);
        console.log(` Duration: ${(discoveryResult.durationMs / 1000).toFixed(1)}s`);
        if (trackProgress) {
          console.log(` Progress saved (${progress.completedStates.length}/${US_STATES.length} states)`);
        }

        if (discoveryResult.errors.length > 0) {
          console.log(` Errors: ${discoveryResult.errors.length}`);
        }

        // Delay between states to avoid rate limiting
        if (i < statesToProcess.length - 1) {
          const delaySeconds = 5;
          console.log(`\n Waiting ${delaySeconds}s before next state...`);
          await new Promise((r) => setTimeout(r, delaySeconds * 1000));
        }
      } catch (error: unknown) {
        // Anything can be thrown; only Error instances are guaranteed a message.
        const message = error instanceof Error ? error.message : String(error);
        console.error(`\n[${stateCode}] ERROR: ${message}`);
        results.push({
          stateCode,
          citiesCrawled: 0,
          locationsFound: 0,
          locationsUpserted: 0,
          durationMs: 0,
          errors: [message],
        });

        // Continue to next state even on error
        await new Promise((r) => setTimeout(r, 3000));
      }
    }

    // Print summary
    const totalDuration = Date.now() - startTime;
    const totalLocations = results.reduce((sum, r) => sum + r.locationsFound, 0);
    const totalUpserted = results.reduce((sum, r) => sum + r.locationsUpserted, 0);
    const totalCities = results.reduce((sum, r) => sum + r.citiesCrawled, 0);
    const statesWithErrors = results.filter((r) => r.errors.length > 0);

    console.log('');
    console.log('='.repeat(70));
    console.log('DISCOVERY COMPLETE - SUMMARY');
    console.log('='.repeat(70));
    console.log(`Total states processed: ${results.length}`);
    console.log(`Total cities crawled: ${totalCities}`);
    console.log(`Total locations found: ${totalLocations}`);
    console.log(`Total locations upserted: ${totalUpserted}`);
    console.log(`Total duration: ${(totalDuration / 1000 / 60).toFixed(1)} minutes`);
    console.log('');

    if (statesWithErrors.length > 0) {
      console.log('States with errors:');
      for (const r of statesWithErrors) {
        console.log(` ${r.stateCode}: ${r.errors.length} error(s)`);
      }
      console.log('');
    }

    // Print per-state breakdown
    console.log('Per-state results:');
    console.log('-'.repeat(70));
    console.log('State\tCities\tFound\tUpserted\tDuration\tStatus');
    console.log('-'.repeat(70));

    for (const r of results) {
      const status = r.errors.length > 0 ? 'ERRORS' : 'OK';
      const duration = (r.durationMs / 1000).toFixed(1) + 's';
      console.log(
        `${r.stateCode}\t${r.citiesCrawled}\t${r.locationsFound}\t${r.locationsUpserted}\t\t${duration}\t\t${status}`
      );
    }

    // Final count from database
    console.log('');
    console.log('='.repeat(70));
    console.log('DATABASE TOTALS');
    console.log('='.repeat(70));

    const { rows: locationCounts } = await pool.query(`
      SELECT
        state_code,
        COUNT(*) as count,
        COUNT(CASE WHEN status = 'discovered' THEN 1 END) as discovered,
        COUNT(CASE WHEN status = 'promoted' THEN 1 END) as promoted
      FROM dutchie_discovery_locations
      WHERE active = TRUE
      GROUP BY state_code
      ORDER BY count DESC
    `);

    console.log('State\tTotal\tDiscovered\tPromoted');
    console.log('-'.repeat(50));
    locationCounts.forEach((row: Record<string, unknown>) => {
      console.log(`${row.state_code || 'N/A'}\t${row.count}\t${row.discovered}\t\t${row.promoted}`);
    });

    const { rows: totalRow } = await pool.query(`
      SELECT COUNT(*) as total FROM dutchie_discovery_locations WHERE active = TRUE
    `);
    console.log('-'.repeat(50));
    console.log(`TOTAL: ${totalRow[0].total} locations in discovery table`);

    const { rows: dispRow } = await pool.query(`
      SELECT COUNT(*) as total FROM dispensaries WHERE menu_type = 'dutchie'
    `);
    console.log(`DISPENSARIES: ${dispRow[0].total} Dutchie dispensaries in main table`);

    // Clear progress file on successful completion of all states
    if (
      trackProgress &&
      (results.length === US_STATES.length ||
        (savedProgress && progress.completedStates.length === US_STATES.length))
    ) {
      clearProgress();
      console.log('\n[Progress] All states completed! Progress file cleared.');
    }
  } finally {
    // Always release the pool, even if the summary queries throw.
    await pool.end();
  }
}
|
||||
|
||||
// Run the script. Any error that escapes main() is treated as fatal:
// log it and exit with a non-zero status.
main().catch((error) => {
  console.error('Fatal error:', error);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user