The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
381 lines
18 KiB
JavaScript
381 lines
18 KiB
JavaScript
"use strict";
|
|
/**
|
|
* Directory-Based Store Matcher
|
|
*
|
|
* Scrapes provider directory pages (Curaleaf, Sol, etc.) to get store lists,
|
|
* then matches them to existing dispensaries by fuzzy name/city/address matching.
|
|
*
|
|
* This allows us to:
|
|
* 1. Find specific store URLs for directory-style websites
|
|
* 2. Match stores confidently by name+city
|
|
* 3. Mark non-Dutchie providers as not_crawlable until we build crawlers
|
|
*/
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.scrapeSolDirectory = scrapeSolDirectory;
|
|
exports.scrapeCuraleafDirectory = scrapeCuraleafDirectory;
|
|
exports.matchDirectoryToDispensaries = matchDirectoryToDispensaries;
|
|
exports.previewDirectoryMatches = previewDirectoryMatches;
|
|
exports.applyHighConfidenceMatches = applyHighConfidenceMatches;
|
|
const connection_1 = require("../db/connection");
|
|
// ============================================================
|
|
// NORMALIZATION FUNCTIONS
|
|
// ============================================================
|
|
/**
 * Normalize a free-form store name so different spellings compare equal.
 *
 * Steps, in order:
 *  1. Lowercase.
 *  2. Drop generic trade words (dispensary, cannabis, shop, ...) when they
 *     follow other text. A leading word is kept, so "Cannabis Club" stays
 *     "cannabis club".
 *  3. Replace punctuation with spaces.
 *  4. Collapse whitespace runs and trim.
 *
 * @param {string|null|undefined} str - Raw name.
 * @returns {string} Normalized name, or '' when the input is empty/missing.
 */
function normalizeForComparison(str) {
    if (!str)
        return '';
    return str
        .toLowerCase()
        // The trailing boundary is a lookahead on purpose: consuming the
        // separator would leave the next suffix word without the leading
        // whitespace it needs, so only the first of two adjacent suffix words
        // ("cannabis dispensary") would be stripped.
        .replace(/\s+(dispensary|cannabis|marijuana|medical|recreational|shop|store|flower|wellness)(?=\s|$)/g, ' ')
        .replace(/[^\w\s]/g, ' ') // Punctuation becomes a separator
        .replace(/\s+/g, ' ') // Collapse whitespace
        .trim();
}
|
|
/**
 * Normalize a city name for equality / substring comparison.
 *
 * Lowercases, deletes punctuation (so "St. Louis" becomes "st louis"),
 * collapses internal whitespace runs to a single space, and trims.
 *
 * @param {string|null|undefined} city - Raw city name.
 * @returns {string} Normalized city, or '' when the input is empty/missing.
 */
function normalizeCity(city) {
    if (!city)
        return '';
    return city
        .toLowerCase()
        .replace(/[^\w\s]/g, '')
        // Without this, "Sun  City" and "Sun City" would not compare equal.
        .replace(/\s+/g, ' ')
        .trim();
}
|
|
/**
 * Similarity of two strings on a 0..1 scale.
 *
 * Computed as 1 - (Levenshtein distance / length of the longer string).
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} 0 when either input is empty/missing, 1 when the strings
 *   are identical, otherwise a value in between.
 */
function stringSimilarity(a, b) {
    // Empty or missing input never counts as similar, even to another empty string.
    if (!a || !b)
        return 0;
    if (a === b)
        return 1;
    // Both strings are non-empty here, so the divisor is at least 1.
    const maxLength = Math.max(a.length, b.length);
    return (maxLength - levenshteinDistance(a, b)) / maxLength;
}
|
|
/**
 * Levenshtein (edit) distance between two strings.
 *
 * Counts the minimum number of single-character insertions, deletions and
 * substitutions needed to turn one string into the other. Characters are
 * compared as UTF-16 code units.
 *
 * Only two rows of the dynamic-programming table are kept, and the rows are
 * sized by the shorter string, instead of materializing the full matrix.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} The edit distance (0 when the strings are equal).
 */
function levenshteinDistance(a, b) {
    // The distance is symmetric, so lay the shorter string along the row.
    const [rowText, columnText] = a.length <= b.length ? [a, b] : [b, a];
    const width = rowText.length;
    // Row 0: turning the empty prefix into rowText[0..j] costs j insertions.
    let previousRow = Array.from({ length: width + 1 }, (_, j) => j);
    let currentRow = new Array(width + 1);
    for (let i = 1; i <= columnText.length; i++) {
        currentRow[0] = i;
        for (let j = 1; j <= width; j++) {
            const substitutionCost = columnText.charCodeAt(i - 1) === rowText.charCodeAt(j - 1) ? 0 : 1;
            currentRow[j] = Math.min(previousRow[j - 1] + substitutionCost, // match / substitution
            currentRow[j - 1] + 1, // insertion
            previousRow[j] + 1 // deletion
            );
        }
        // Reuse the two buffers rather than allocating a new row each pass.
        [previousRow, currentRow] = [currentRow, previousRow];
    }
    return previousRow[width];
}
|
|
/**
 * Check whether `haystack` contains `needle` after both are passed through
 * normalizeForComparison.
 *
 * @param {string|null|undefined} haystack - Text to search in.
 * @param {string|null|undefined} needle - Text to search for.
 * @returns {boolean} True when the normalized needle is non-empty and occurs
 *   in the normalized haystack.
 */
function containsNormalized(haystack, needle) {
    const normalizedNeedle = normalizeForComparison(needle);
    // ''.includes-style matches succeed against everything; a needle that
    // normalizes to nothing (empty, or only a stripped suffix word) must not
    // be reported as "contained".
    if (!normalizedNeedle)
        return false;
    return normalizeForComparison(haystack).includes(normalizedNeedle);
}
|
|
// ============================================================
|
|
// PROVIDER DIRECTORY SCRAPERS
|
|
// ============================================================
|
|
/**
 * Hardcoded Sol Flower locations, used when the live page cannot be fetched
 * or yields no parseable entries. Returns a fresh array on every call so
 * callers may mutate the result freely.
 *
 * NOTE(review): the first entry is named "32nd & Shea" but its URL slug is
 * "deer-valley" — confirm which one is correct.
 */
function getSolFallbackStores() {
    return [
        { name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' },
        { name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' },
        { name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' },
        { name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' },
        { name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' },
        { name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' },
        { name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' },
        { name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' },
        { name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' },
    ];
}
/**
 * Decode the few HTML entities likely to appear in store names/addresses.
 * `&amp;` is decoded last so an escaped entity such as `&amp;nbsp;` is not
 * decoded twice.
 */
function decodeBasicHtmlEntities(text) {
    return text
        .replace(/&nbsp;|&#0*160;/gi, ' ')
        .replace(/&amp;|&#0*38;/gi, '&');
}
/**
 * Extract store entries from the Sol locations page HTML.
 *
 * Looks for `<a href="/locations/slug/">NAME</a>` followed by text that looks
 * like a street address. Each location URL is reported once, even if the page
 * links to it several times.
 *
 * NOTE(review): the lazy `[\s\S]*?` between link and address can span other
 * markup, so a link without its own address may pick up a later card's
 * address — verify against the real page structure.
 */
function parseSolLocations(html) {
    // First matching pattern wins; checked against both name and address.
    const cityPatterns = [
        { pattern: /phoenix/i, city: 'Phoenix' },
        { pattern: /scottsdale/i, city: 'Scottsdale' },
        { pattern: /tempe/i, city: 'Tempe' },
        { pattern: /tucson/i, city: 'Tucson' },
        { pattern: /mesa/i, city: 'Mesa' },
        { pattern: /sun city/i, city: 'Sun City' },
        { pattern: /glendale/i, city: 'Glendale' },
    ];
    const locationRegex = /<a[^>]+href="(\/locations\/[^"]+)"[^>]*>([^<]+)<\/a>[\s\S]*?(\d+[^<]+(?:Ave|St|Blvd|Dr|Rd|Way)[^<]*)/gi;
    // Keyed by URL so repeated links to one location collapse into one store
    // (insertion order is preserved, first occurrence wins).
    const storesByUrl = new Map();
    for (const [, path, rawName, rawAddress] of html.matchAll(locationRegex)) {
        const storeUrl = `https://www.livewithsol.com${path}`;
        if (storesByUrl.has(storeUrl))
            continue;
        const name = decodeBasicHtmlEntities(rawName).trim();
        const address = decodeBasicHtmlEntities(rawAddress).trim();
        const knownCity = cityPatterns.find(({ pattern }) => pattern.test(name) || pattern.test(address));
        storesByUrl.set(storeUrl, {
            name,
            city: knownCity ? knownCity.city : 'Unknown',
            state: 'AZ',
            address,
            storeUrl,
        });
    }
    return [...storesByUrl.values()];
}
/**
 * Sol Flower (livewithsol.com) - Static HTML, easy to scrape
 *
 * Fetches the locations page and parses store name / address / URL out of it.
 * Never rejects: on a network error, a non-2xx response, or when nothing can
 * be parsed, the hardcoded location list is returned instead.
 *
 * @returns {Promise<Array<{name: string, city: string, state: string, address: string, storeUrl: string}>>}
 */
async function scrapeSolDirectory() {
    console.log('[DirectoryMatcher] Scraping Sol Flower directory...');
    try {
        const response = await fetch('https://www.livewithsol.com/locations/', {
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                Accept: 'text/html',
            },
        });
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }
        const stores = parseSolLocations(await response.text());
        // If regex didn't work, use known hardcoded values (fallback)
        if (stores.length === 0) {
            console.log('[DirectoryMatcher] Using hardcoded Sol locations');
            return getSolFallbackStores();
        }
        console.log(`[DirectoryMatcher] Found ${stores.length} Sol Flower locations`);
        return stores;
    }
    catch (error) {
        // A thrown value is not guaranteed to be an Error instance.
        const detail = error instanceof Error ? error.message : error;
        console.error('[DirectoryMatcher] Error scraping Sol directory:', detail);
        // Return hardcoded fallback
        return getSolFallbackStores();
    }
}
|
|
/**
 * Curaleaf directory for Arizona.
 *
 * No request is made: per the log message below, the site's age-gate blocks a
 * simple fetch, so a hardcoded list of AZ locations is returned. The intent
 * recorded by the original author is to replace this with a Playwright-based
 * scrape in production.
 *
 * @returns {Promise<Array<{name: string, city: string, state: string, address: string, storeUrl: string}>>}
 *   A freshly built list on every call.
 */
async function scrapeCuraleafDirectory() {
    console.log('[DirectoryMatcher] Using hardcoded Curaleaf AZ locations (age-gate blocks simple fetch)...');
    const storeBaseUrl = 'https://curaleaf.com/stores/';
    // One row per location: [name, city, street address, URL slug]
    const locationRows = [
        ['Curaleaf Phoenix Camelback', 'Phoenix', '4811 E Camelback Rd', 'curaleaf-az-phoenix-camelback'],
        ['Curaleaf Phoenix Midtown', 'Phoenix', '1928 E Highland Ave', 'curaleaf-az-phoenix-midtown'],
        ['Curaleaf Glendale East', 'Glendale', '5150 W Glendale Ave', 'curaleaf-az-glendale-east'],
        ['Curaleaf Glendale West', 'Glendale', '6501 W Glendale Ave', 'curaleaf-az-glendale-west'],
        ['Curaleaf Gilbert', 'Gilbert', '1736 E Williams Field Rd', 'curaleaf-az-gilbert'],
        ['Curaleaf Mesa', 'Mesa', '1540 S Power Rd', 'curaleaf-az-mesa'],
        ['Curaleaf Tempe', 'Tempe', '1815 E Broadway Rd', 'curaleaf-az-tempe'],
        ['Curaleaf Scottsdale', 'Scottsdale', '8904 E Indian Bend Rd', 'curaleaf-az-scottsdale'],
        ['Curaleaf Tucson Prince', 'Tucson', '3955 W Prince Rd', 'curaleaf-az-tucson-prince'],
        ['Curaleaf Tucson Midvale', 'Tucson', '2936 N Midvale Park Rd', 'curaleaf-az-tucson-midvale'],
        ['Curaleaf Sedona', 'Sedona', '525 AZ-179', 'curaleaf-az-sedona'],
        ['Curaleaf Youngtown', 'Youngtown', '11125 W Grand Ave', 'curaleaf-az-youngtown'],
    ];
    return locationRows.map(([name, city, address, slug]) => ({
        name,
        city,
        state: 'AZ',
        address,
        storeUrl: `${storeBaseUrl}${slug}`,
    }));
}
|
|
/**
 * Match a directory store to an existing dispensary.
 *
 * Every dispensary is scored against the store (maximum 100 points):
 *  - up to 50 for normalized-name similarity,
 *  - 25 for an exact normalized-city match, or 15 when one city string
 *    contains the other,
 *  - 15 when the street parts of the two addresses are more than 70% similar,
 *  - 10 when the dispensary name contains the store's first word (its brand).
 *
 * The highest-scoring dispensary wins (the first one on ties) and the score
 * maps to a confidence level: >= 70 high, >= 50 medium, >= 30 low, else none.
 * The best candidate's id/name are returned even when confidence is 'none'.
 *
 * @param {{name: string, city: string, address?: string}} store - Directory entry.
 * @param {Array<{id: *, name: string|null, city?: string|null, address?: string|null}>} dispensaries
 *   Candidate rows to match against.
 * @returns {{directoryStore: object, dispensaryId: *, dispensaryName: string|null, confidence: string, matchReason: string}}
 */
function matchStoreToDispensary(store, dispensaries) {
    // Tokens 1..3 of the address: skips the leading house number and keeps
    // the next three words as the "street" part.
    const streetOf = (address) => address ? address.trim().toLowerCase().split(/\s+/).slice(1, 4).join(' ') : '';
    // Everything derived from the store alone is computed once, not per dispensary.
    const normalizedStoreName = normalizeForComparison(store.name);
    const normalizedStoreCity = normalizeCity(store.city);
    const storeStreet = streetOf(store.address);
    const brandName = (store.name || '').trim().split(/\s+/)[0].toLowerCase(); // e.g., "curaleaf", "sol"
    let bestMatch = null;
    let bestScore = 0;
    let matchReason = '';
    for (const disp of dispensaries) {
        const normalizedDispName = normalizeForComparison(disp.name);
        const normalizedDispCity = normalizeCity(disp.city || '');
        let score = 0;
        const reasons = [];
        // 1. Name similarity (max 50 points)
        const nameSimilarity = stringSimilarity(normalizedStoreName, normalizedDispName);
        score += nameSimilarity * 50;
        if (nameSimilarity > 0.8)
            reasons.push(`name_match(${(nameSimilarity * 100).toFixed(0)}%)`);
        // 2. City match (25 points for exact, 15 for partial)
        if (normalizedStoreCity && normalizedDispCity) {
            if (normalizedStoreCity === normalizedDispCity) {
                score += 25;
                reasons.push('city_exact');
            }
            else if (normalizedStoreCity.includes(normalizedDispCity) ||
                normalizedDispCity.includes(normalizedStoreCity)) {
                score += 15;
                reasons.push('city_partial');
            }
        }
        // 3. Address contains street name (15 points)
        const dispStreet = streetOf(disp.address);
        if (storeStreet && dispStreet && stringSimilarity(storeStreet, dispStreet) > 0.7) {
            score += 15;
            reasons.push('address_match');
        }
        // 4. Brand name in dispensary name (10 points)
        // A blank brand would be "included" in every name, and a null
        // dispensary name must not throw, so both are guarded.
        if (brandName && (disp.name || '').toLowerCase().includes(brandName)) {
            score += 10;
            reasons.push('brand_match');
        }
        if (score > bestScore) {
            bestScore = score;
            bestMatch = disp;
            matchReason = reasons.join(', ');
        }
    }
    // Determine confidence level
    let confidence;
    if (bestScore >= 70) {
        confidence = 'high';
    }
    else if (bestScore >= 50) {
        confidence = 'medium';
    }
    else if (bestScore >= 30) {
        confidence = 'low';
    }
    else {
        confidence = 'none';
    }
    return {
        directoryStore: store,
        dispensaryId: bestMatch?.id || null,
        dispensaryName: bestMatch?.name || null,
        confidence,
        matchReason: matchReason || 'no_match',
    };
}
|
|
// ============================================================
|
|
// MAIN FUNCTIONS
|
|
// ============================================================
|
|
/**
 * Match one provider's directory against the AZ dispensaries in the database.
 *
 * Loads the provider's store list, loads every dispensary row with
 * state = 'AZ', scores each store against those rows, and returns a summary
 * report. When `dryRun` is false, each high-confidence match is written to
 * the database as it is found; medium/low matches are only reported.
 *
 * NOTE(review): nothing here prevents two directory stores from resolving to
 * the same dispensary id; in that case the later UPDATE overwrites the earlier.
 *
 * @param {string} provider - 'curaleaf' or 'sol'.
 * @param {boolean} [dryRun=true] - When true, no database rows are modified.
 * @returns {Promise<object>} Report with per-confidence counts and the full
 *   list of match results.
 * @throws {Error} When `provider` is not one of the supported values.
 */
async function matchDirectoryToDispensaries(provider, dryRun = true) {
    console.log(`[DirectoryMatcher] Running ${provider} directory matching (dryRun=${dryRun})...`);
    // Get directory stores
    let directoryStores;
    switch (provider) {
        case 'curaleaf':
            directoryStores = await scrapeCuraleafDirectory();
            break;
        case 'sol':
            directoryStores = await scrapeSolDirectory();
            break;
        default:
            throw new Error(`Unknown provider: ${provider}`);
    }
    // Get all AZ dispensaries from database
    const queryResult = await (0, connection_1.query)(`SELECT id, name, city, state, address, menu_type, menu_url, website
     FROM dispensaries
     WHERE state = 'AZ'`);
    const dispensaries = queryResult.rows;
    console.log(`[DirectoryMatcher] Matching ${directoryStores.length} directory stores against ${dispensaries.length} dispensaries`);
    // Match each directory store
    const results = [];
    for (const store of directoryStores) {
        const match = matchStoreToDispensary(store, dispensaries);
        results.push(match);
        // Only high-confidence matches with a resolved id are ever written,
        // and only outside of a dry run. Writes happen one at a time.
        const shouldApply = !dryRun && match.confidence === 'high' && match.dispensaryId;
        if (shouldApply) {
            await applyDirectoryMatch(match.dispensaryId, provider, store);
        }
    }
    // Count results
    const countByConfidence = (level) => results.filter((r) => r.confidence === level).length;
    const report = {
        provider,
        totalDirectoryStores: directoryStores.length,
        highConfidenceMatches: countByConfidence('high'),
        mediumConfidenceMatches: countByConfidence('medium'),
        lowConfidenceMatches: countByConfidence('low'),
        unmatched: countByConfidence('none'),
        results,
    };
    console.log(`[DirectoryMatcher] ${provider} matching complete:`);
    console.log(` - High confidence: ${report.highConfidenceMatches}`);
    console.log(` - Medium confidence: ${report.mediumConfidenceMatches}`);
    console.log(` - Low confidence: ${report.lowConfidenceMatches}`);
    console.log(` - Unmatched: ${report.unmatched}`);
    return report;
}
|
|
/**
 * Persist a directory match onto one dispensary row.
 *
 * Issues a single UPDATE on `dispensaries` for the given id that:
 *  - sets menu_type to the provider and menu_url to the store URL,
 *  - clears platform_dispensary_id,
 *  - merges detection details (provider, method 'directory_match', timestamp,
 *    directory store name/url/city/address) into provider_detection_data,
 *    flagging the row not_crawlable with a reason string,
 *  - bumps updated_at.
 *
 * @param {*} dispensaryId - Primary key of the dispensary to update.
 * @param {string} provider - Provider key, e.g. 'curaleaf' or 'sol'.
 * @param {{name: string, city: string, address: string, storeUrl: string}} store
 *   Directory entry that was matched.
 * @returns {Promise<void>}
 */
async function applyDirectoryMatch(dispensaryId, provider, store) {
    const { storeUrl, name, city, address } = store;
    console.log(`[DirectoryMatcher] Applying match: dispensary ${dispensaryId} -> ${storeUrl}`);
    const notCrawlableReason = `${provider} proprietary menu - no crawler available`;
    // Positional parameters, in $1..$7 order.
    const params = [
        provider, // $1 menu_type / detected_provider
        storeUrl, // $2 menu_url / directory_store_url
        name, // $3 directory_store_name
        city, // $4 directory_store_city
        address, // $5 directory_store_address
        notCrawlableReason, // $6 not_crawlable_reason
        dispensaryId, // $7 row id
    ];
    await (0, connection_1.query)(`
    UPDATE dispensaries SET
      menu_type = $1,
      menu_url = $2,
      platform_dispensary_id = NULL,
      provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
        jsonb_build_object(
          'detected_provider', $1::text,
          'detection_method', 'directory_match'::text,
          'detected_at', NOW(),
          'directory_store_name', $3::text,
          'directory_store_url', $2::text,
          'directory_store_city', $4::text,
          'directory_store_address', $5::text,
          'not_crawlable', true,
          'not_crawlable_reason', $6::text
        ),
      updated_at = NOW()
    WHERE id = $7
  `, params);
}
|
|
/**
 * Preview matches for a provider without writing anything.
 *
 * Runs matchDirectoryToDispensaries in dry-run mode.
 *
 * @param {string} provider - Provider key passed through unchanged.
 * @returns {Promise<object>} The matching report.
 */
async function previewDirectoryMatches(provider) {
    const dryRun = true;
    return matchDirectoryToDispensaries(provider, dryRun);
}
|
|
/**
 * Run matching for a provider and persist the high-confidence matches.
 *
 * Runs matchDirectoryToDispensaries with dry-run disabled.
 *
 * @param {string} provider - Provider key passed through unchanged.
 * @returns {Promise<object>} The matching report.
 */
async function applyHighConfidenceMatches(provider) {
    const dryRun = false;
    return matchDirectoryToDispensaries(provider, dryRun);
}
|