Files
cannaiq/backend/src/scripts/backfill-store-dispensary.ts
Kelly 3861a31a3b Add crawler scheduler, orchestrator, and multi-category intelligence
- Add scheduler UI with store schedules, job queue, and global settings
- Add store crawl orchestrator for intelligent crawl workflow
- Add multi-category intelligence detection (product, specials, brands, metadata)
- Add CrawlerLogger for structured JSON logging
- Add migrations for scheduler tables and dispensary linking
- Add dispensary → scheduler navigation link
- Support production/sandbox crawler modes per provider

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-30 09:29:15 -07:00

346 lines
9.6 KiB
TypeScript

#!/usr/bin/env npx tsx
/**
* Backfill Store-Dispensary Mapping
*
* Links existing stores (scheduler) to dispensaries (master AZDHS directory)
* by matching on name, company name, slug, or fuzzy name similarity.
*
* Usage:
* npx tsx src/scripts/backfill-store-dispensary.ts # Preview matches
* npx tsx src/scripts/backfill-store-dispensary.ts --apply # Apply matches
* npx tsx src/scripts/backfill-store-dispensary.ts --verbose # Show all match details
*/
import { pool } from '../db/migrate';
import { logger } from '../services/logger';
// Command-line arguments after the node binary and script path.
const args = process.argv.slice(2);

/** True when at least one of the given spellings appears in the CLI arguments. */
const hasArg = (...spellings: string[]): boolean =>
  spellings.some((spelling) => args.includes(spelling));

// Boolean switches understood by this script.
const flags = {
  apply: hasArg('--apply'),
  verbose: hasArg('--verbose'),
  help: hasArg('--help', '-h'),
};
/**
 * Row shape this script selects from the `stores` table.
 */
interface Store {
/** Row id; used in the `WHERE id = $2` clause when a mapping is applied. */
id: number;
/** Store name; the primary input to name-based matching. */
name: string;
/** Store slug; compared case-insensitively against the dispensary slug. */
slug: string;
/** Id of the linked dispensary, or null while the store is still unmapped. */
dispensary_id: number | null;
}
/**
 * Row shape this script selects from the `dispensaries` table.
 */
interface Dispensary {
/** Row id; written into `stores.dispensary_id` when a mapping is applied. */
id: number;
/** Dispensary name; compared against the store name (raw and normalized). */
name: string;
/** Optional company name; when present it is a second candidate for name matching. */
company_name: string | null;
/** City; only shown in the match report, not used for matching in this file. */
city: string;
/** Address; selected by the query but not otherwise read in this file. */
address: string;
/** Dispensary slug; compared case-insensitively against the store slug. */
slug: string;
}
/**
 * Outcome of looking for a dispensary that corresponds to one store.
 */
interface MatchResult {
/** The store that was being matched. */
store: Store;
/** The chosen dispensary, or null when no candidate qualified. */
dispensary: Dispensary | null;
/** Which kind of evidence produced the match; 'none' when there is no match. */
matchType: 'exact_name' | 'normalized_name' | 'company_name' | 'slug' | 'fuzzy' | 'none';
/**
 * Confidence on a 0-100 scale: 100 exact name, 95 normalized name,
 * 90 company name, 85 slug, the similarity score (>= 70) for fuzzy, 0 for none.
 */
score: number;
}
/**
 * Normalize a store/dispensary name for comparison.
 *
 * Steps, in order:
 *  1. lowercase;
 *  2. map curly apostrophes (U+2018 / U+2019) to a straight `'` so that
 *     "Joe’s" and "Joe's" normalize identically;
 *  3. turn hyphens / en dashes / em dashes into spaces;
 *  4. drop generic industry and legal words, but only as WHOLE words, so names
 *     such as "Incredibles" or "Workshop" are left intact;
 *  5. strip remaining punctuation (apostrophes are kept);
 *  6. collapse runs of whitespace and trim.
 *
 * @param name Raw name as stored in the database.
 * @returns The normalized name; may be an empty string when the input
 *          consists solely of generic words and punctuation.
 */
function normalizeName(name: string): string {
  return name
    .toLowerCase()
    .replace(/[\u2018\u2019]/g, "'") // Curly apostrophes -> straight apostrophe
    .replace(/\s*[-\u2013\u2014]\s*/g, ' ') // Normalize dashes to spaces
    // \b keeps the stop-word list from eating substrings of longer words.
    .replace(/\b(dispensary|cannabis|marijuana|weed|shop|store|llc|inc)\b/g, ' ')
    .replace(/[^\w\s']/g, '') // Remove other punctuation
    .replace(/\s+/g, ' ') // Collapse whitespace
    .trim();
}
/**
 * Levenshtein (edit) distance between two strings, used for fuzzy matching.
 *
 * Only two rows of the dynamic-programming table are kept at a time: the row
 * for the previous character of `b` and the row currently being filled in.
 * Characters are compared per UTF-16 code unit.
 *
 * @param a First string.
 * @param b Second string.
 * @returns Minimum number of single-character insertions, deletions and
 *          substitutions needed to turn one string into the other.
 */
function levenshteinDistance(a: string, b: string): number {
  const width = a.length;

  // Row 0: turning the empty prefix of `b` into each prefix of `a`.
  let previousRow: number[] = Array.from({ length: width + 1 }, (_, col) => col);

  for (let row = 1; row <= b.length; row++) {
    const bChar = b.charAt(row - 1);
    const currentRow: number[] = [row];

    for (let col = 1; col <= width; col++) {
      if (bChar === a.charAt(col - 1)) {
        // Characters agree: carry the diagonal cost forward unchanged.
        currentRow[col] = previousRow[col - 1];
        continue;
      }
      const substitution = previousRow[col - 1] + 1;
      const insertion = currentRow[col - 1] + 1;
      const deletion = previousRow[col] + 1;
      currentRow[col] = Math.min(substitution, insertion, deletion);
    }

    previousRow = currentRow;
  }

  return previousRow[width];
}
/**
 * Express how alike two strings are as a whole-number percentage (0-100).
 *
 * The edit distance is scaled by the length of the longer string; two empty
 * strings are treated as identical (100).
 *
 * @param a First string.
 * @param b Second string.
 * @returns Rounded similarity percentage.
 */
function similarityScore(a: string, b: string): number {
  const longest = Math.max(a.length, b.length);
  if (longest === 0) {
    return 100;
  }

  const edits = levenshteinDistance(a, b);
  const closeness = 1 - edits / longest;
  return Math.round(closeness * 100);
}
/**
 * Find the best dispensary match for a store.
 *
 * Every dispensary is examined and the strongest piece of evidence wins,
 * regardless of the order of `dispensaries`:
 *
 *   exact name (100) > normalized name (95) > company name (90) > slug (85)
 *   > fuzzy name/company similarity (score itself, must be >= 70)
 *
 * A deterministic tier always outranks a fuzzy match, even one with a higher
 * numeric score. Within the same tier the higher score wins, and on a full tie
 * the dispensary seen first is kept. Only an exact-name hit returns early,
 * because nothing can outrank it.
 *
 * Names that normalize to an empty string (e.g. "Cannabis Store") carry no
 * identifying information, so they are never used for normalized, company or
 * fuzzy comparison; likewise an empty slug never counts as a slug match.
 *
 * @param store        The store to match.
 * @param dispensaries Candidate dispensaries.
 * @returns The best match found; `dispensary` is null and `matchType` is
 *          'none' when no candidate qualified.
 */
function findBestMatch(store: Store, dispensaries: Dispensary[]): MatchResult {
  // Relative trust of each kind of evidence; higher wins.
  const tierRank: Record<MatchResult['matchType'], number> = {
    exact_name: 5,
    normalized_name: 4,
    company_name: 3,
    slug: 2,
    fuzzy: 1,
    none: 0,
  };

  const lowerStoreName = store.name.toLowerCase();
  const normalizedStoreName = normalizeName(store.name);
  const storeSlug = store.slug.toLowerCase();

  let bestMatch: MatchResult = {
    store,
    dispensary: null,
    matchType: 'none',
    score: 0,
  };

  // Replace the current best only when the candidate is strictly better.
  const offer = (
    dispensary: Dispensary,
    matchType: MatchResult['matchType'],
    score: number
  ): void => {
    const rankDelta = tierRank[matchType] - tierRank[bestMatch.matchType];
    if (rankDelta > 0 || (rankDelta === 0 && score > bestMatch.score)) {
      bestMatch = { store, dispensary, matchType, score };
    }
  };

  for (const disp of dispensaries) {
    // 1. Exact name match (case-insensitive) - cannot be beaten, so stop here.
    if (lowerStoreName === disp.name.toLowerCase()) {
      return { store, dispensary: disp, matchType: 'exact_name', score: 100 };
    }

    const normalizedDispName = normalizeName(disp.name);
    const normalizedCompanyName = disp.company_name ? normalizeName(disp.company_name) : '';

    // 2. Normalized name match
    if (normalizedStoreName !== '' && normalizedStoreName === normalizedDispName) {
      offer(disp, 'normalized_name', 95);
      continue;
    }

    // 3. Store name matches company name
    if (normalizedStoreName !== '' && normalizedStoreName === normalizedCompanyName) {
      offer(disp, 'company_name', 90);
      continue;
    }

    // 4. Slug match
    if (storeSlug !== '' && storeSlug === disp.slug.toLowerCase()) {
      offer(disp, 'slug', 85);
      continue;
    }

    // 5. Fuzzy matching (score must reach 70). Skipped when the store name is
    //    empty after normalization, or when a deterministic match has already
    //    been found (a fuzzy result could not replace it anyway).
    if (normalizedStoreName === '' || tierRank[bestMatch.matchType] > tierRank.fuzzy) {
      continue;
    }
    const nameScore = similarityScore(normalizedStoreName, normalizedDispName);
    const companyScore = normalizedCompanyName
      ? similarityScore(normalizedStoreName, normalizedCompanyName)
      : 0;
    const fuzzyScore = Math.max(nameScore, companyScore);
    if (fuzzyScore >= 70) {
      offer(disp, 'fuzzy', fuzzyScore);
    }
  }

  return bestMatch;
}
/**
 * Script entry point.
 *
 * Loads unmapped stores and all dispensaries, computes the best dispensary for
 * each store, prints a report grouped by match type and, when `--apply` was
 * given, writes `dispensary_id` for every matched store.
 *
 * Exit behaviour:
 *  - `--help` prints usage and exits 0 before touching the database.
 *  - On any error the error is printed and the exit code is set to 1.
 *  - The pool is closed exactly once, in `finally`, on every path that reached
 *    the database. `process.exit()` is deliberately not used on those paths
 *    because it would terminate before `finally` could run.
 */
async function main(): Promise<void> {
  if (flags.help) {
    console.log(`
Backfill Store-Dispensary Mapping
Links existing stores (scheduler) to dispensaries (master AZDHS directory)
by matching on name, company name, or slug similarity.
USAGE:
npx tsx src/scripts/backfill-store-dispensary.ts [OPTIONS]
OPTIONS:
--apply Apply the mappings to the database (default: preview only)
--verbose Show detailed match information for all stores
--help, -h Show this help message
EXAMPLES:
# Preview what would be matched
npx tsx src/scripts/backfill-store-dispensary.ts
# Apply the mappings
npx tsx src/scripts/backfill-store-dispensary.ts --apply
# Show verbose output
npx tsx src/scripts/backfill-store-dispensary.ts --verbose
`);
    process.exit(0);
  }
  console.log('\n📦 Backfill Store-Dispensary Mapping');
  console.log('=====================================\n');
  try {
    // Fetch all stores without a dispensary_id
    const storesResult = await pool.query<Store>(`
SELECT id, name, slug, dispensary_id
FROM stores
WHERE dispensary_id IS NULL
ORDER BY name
`);
    const unmappedStores = storesResult.rows;
    // Fetch all already-mapped stores for context
    const mappedResult = await pool.query<Store>(`
SELECT id, name, slug, dispensary_id
FROM stores
WHERE dispensary_id IS NOT NULL
ORDER BY name
`);
    const mappedStores = mappedResult.rows;
    // Fetch all dispensaries
    const dispResult = await pool.query<Dispensary>(`
SELECT id, name, company_name, city, address, slug
FROM dispensaries
ORDER BY name
`);
    const dispensaries = dispResult.rows;
    console.log(`📊 Current Status:`);
    console.log(` Stores without dispensary_id: ${unmappedStores.length}`);
    console.log(` Stores already mapped: ${mappedStores.length}`);
    console.log(` Total dispensaries: ${dispensaries.length}\n`);
    if (unmappedStores.length === 0) {
      console.log('✅ All stores are already mapped to dispensaries!\n');
      // Nothing to do; `finally` closes the pool and the process exits with 0.
      return;
    }
    // Find matches for each unmapped store
    const matches: MatchResult[] = [];
    const noMatches: Store[] = [];
    for (const store of unmappedStores) {
      const match = findBestMatch(store, dispensaries);
      if (match.dispensary) {
        matches.push(match);
      } else {
        noMatches.push(store);
      }
    }
    // Sort matches by score (highest first)
    matches.sort((a, b) => b.score - a.score);
    // Display results
    console.log(`\n🔗 Matches Found: ${matches.length}`);
    console.log('----------------------------------\n');
    if (matches.length > 0) {
      // Group by match type, keeping the order in which types first appear
      const byType: Record<string, MatchResult[]> = {};
      for (const m of matches) {
        if (!byType[m.matchType]) byType[m.matchType] = [];
        byType[m.matchType].push(m);
      }
      const typeLabels: Record<string, string> = {
        exact_name: '✅ Exact Name Match',
        normalized_name: '✅ Normalized Name Match',
        company_name: '🏢 Company Name Match',
        slug: '🔗 Slug Match',
        fuzzy: '🔍 Fuzzy Match',
      };
      for (const [type, results] of Object.entries(byType)) {
        console.log(`${typeLabels[type]} (${results.length}):`);
        for (const r of results) {
          const dispInfo = r.dispensary;
          // Every entry in `matches` has a dispensary; the guard narrows the type.
          if (!dispInfo) continue;
          console.log(` • "${r.store.name}" → "${dispInfo.name}" (${dispInfo.city}) [${r.score}%]`);
        }
        console.log('');
      }
    }
    if (noMatches.length > 0) {
      console.log(`\n❌ No Match Found: ${noMatches.length}`);
      console.log('----------------------------------\n');
      for (const store of noMatches) {
        console.log(` • "${store.name}" (slug: ${store.slug})`);
      }
      console.log('');
    }
    // Apply if requested
    if (flags.apply && matches.length > 0) {
      console.log('\n🔧 Applying mappings...\n');
      let updated = 0;
      for (const match of matches) {
        if (!match.dispensary) continue;
        await pool.query(
          'UPDATE stores SET dispensary_id = $1 WHERE id = $2',
          [match.dispensary.id, match.store.id]
        );
        updated++;
        if (flags.verbose) {
          console.log(` ✓ Linked store ${match.store.id} to dispensary ${match.dispensary.id}`);
        }
      }
      console.log(`\n✅ Updated ${updated} stores with dispensary mappings\n`);
      logger.info('system', `Backfill complete: linked ${updated} stores to dispensaries`);
    } else if (matches.length > 0 && !flags.apply) {
      console.log('\n💡 Run with --apply to update the database\n');
    }
    // Summary - the wording reflects whether the mappings were actually written
    console.log('📈 Summary:');
    console.log(` ${flags.apply ? 'Matched' : 'Would match'}: ${matches.length} stores`);
    console.log(` No match: ${noMatches.length} stores`);
    console.log(` Match rate: ${Math.round((matches.length / unmappedStores.length) * 100)}%\n`);
  } catch (error) {
    console.error('Error:', error);
    // Record the failure without exiting, so `finally` can still close the pool.
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}
// Last-resort handler for a rejection that escapes main() (for example from
// closing the pool): log it and make sure the process reports failure.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});