"use strict"; /** * Directory-Based Store Matcher * * Scrapes provider directory pages (Curaleaf, Sol, etc.) to get store lists, * then matches them to existing dispensaries by fuzzy name/city/address matching. * * This allows us to: * 1. Find specific store URLs for directory-style websites * 2. Match stores confidently by name+city * 3. Mark non-Dutchie providers as not_crawlable until we build crawlers */ Object.defineProperty(exports, "__esModule", { value: true }); exports.scrapeSolDirectory = scrapeSolDirectory; exports.scrapeCuraleafDirectory = scrapeCuraleafDirectory; exports.matchDirectoryToDispensaries = matchDirectoryToDispensaries; exports.previewDirectoryMatches = previewDirectoryMatches; exports.applyHighConfidenceMatches = applyHighConfidenceMatches; const connection_1 = require("../db/connection"); // ============================================================ // NORMALIZATION FUNCTIONS // ============================================================ /** * Normalize a string for comparison: * - Lowercase * - Remove common suffixes (dispensary, cannabis, etc.) * - Remove punctuation * - Collapse whitespace */ function normalizeForComparison(str) { if (!str) return ''; return str .toLowerCase() .replace(/\s+(dispensary|cannabis|marijuana|medical|recreational|shop|store|flower|wellness)(\s|$)/gi, ' ') .replace(/[^\w\s]/g, ' ') // Remove punctuation .replace(/\s+/g, ' ') // Collapse whitespace .trim(); } /** * Normalize city name for comparison */ function normalizeCity(city) { if (!city) return ''; return city .toLowerCase() .replace(/[^\w\s]/g, '') .trim(); } /** * Calculate similarity between two strings (0-1) * Uses Levenshtein distance normalized by max length */ function stringSimilarity(a, b) { if (!a || !b) return 0; if (a === b) return 1; const longer = a.length > b.length ? a : b; const shorter = a.length > b.length ? b : a; if (longer.length === 0) return 1; const distance = levenshteinDistance(longer, shorter); return (longer.length - distance) / longer.length; } /** * Levenshtein distance between two strings */ function levenshteinDistance(a, b) { const matrix = []; for (let i = 0; i <= b.length; i++) { matrix[i] = [i]; } for (let j = 0; j <= a.length; j++) { matrix[0][j] = j; } for (let i = 1; i <= b.length; i++) { for (let j = 1; j <= a.length; j++) { if (b.charAt(i - 1) === a.charAt(j - 1)) { matrix[i][j] = matrix[i - 1][j - 1]; } else { matrix[i][j] = Math.min(matrix[i - 1][j - 1] + 1, // substitution matrix[i][j - 1] + 1, // insertion matrix[i - 1][j] + 1 // deletion ); } } } return matrix[b.length][a.length]; } /** * Check if string contains another (with normalization) */ function containsNormalized(haystack, needle) { return normalizeForComparison(haystack).includes(normalizeForComparison(needle)); } // ============================================================ // PROVIDER DIRECTORY SCRAPERS // ============================================================ /** * Sol Flower (livewithsol.com) - Static HTML, easy to scrape */ async function scrapeSolDirectory() { console.log('[DirectoryMatcher] Scraping Sol Flower directory...'); try { const response = await fetch('https://www.livewithsol.com/locations/', { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', Accept: 'text/html', }, }); if (!response.ok) { throw new Error(`HTTP ${response.status}`); } const html = await response.text(); // Extract store entries from HTML // Sol's structure: Each location has name, address in specific divs const stores = []; // Pattern to find location cards // Format: NAME with address nearby const locationRegex = /]+href="(\/locations\/[^"]+)"[^>]*>([^<]+)<\/a>[\s\S]*?(\d+[^<]+(?:Ave|St|Blvd|Dr|Rd|Way)[^<]*)/gi; let match; while ((match = locationRegex.exec(html)) !== null) { const [, path, name, address] = match; // Extract city from common Arizona cities let city = 'Unknown'; const cityPatterns = [ { pattern: /phoenix/i, city: 'Phoenix' }, { pattern: /scottsdale/i, city: 'Scottsdale' }, { pattern: /tempe/i, city: 'Tempe' }, { pattern: /tucson/i, city: 'Tucson' }, { pattern: /mesa/i, city: 'Mesa' }, { pattern: /sun city/i, city: 'Sun City' }, { pattern: /glendale/i, city: 'Glendale' }, ]; for (const { pattern, city: cityName } of cityPatterns) { if (pattern.test(name) || pattern.test(address)) { city = cityName; break; } } stores.push({ name: name.trim(), city, state: 'AZ', address: address.trim(), storeUrl: `https://www.livewithsol.com${path}`, }); } // If regex didn't work, use known hardcoded values (fallback) if (stores.length === 0) { console.log('[DirectoryMatcher] Using hardcoded Sol locations'); return [ { name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' }, { name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' }, { name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' }, { name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' }, { name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' }, { name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' }, { name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' }, { name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' }, { name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' }, ]; } console.log(`[DirectoryMatcher] Found ${stores.length} Sol Flower locations`); return stores; } catch (error) { console.error('[DirectoryMatcher] Error scraping Sol directory:', error.message); // Return hardcoded fallback return [ { name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' }, { name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' }, { name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' }, { name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' }, { name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' }, { name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' }, { name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' }, { name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' }, { name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' }, ]; } } /** * Curaleaf - Has age-gate, so we need hardcoded AZ locations * In production, this would use Playwright to bypass age-gate */ async function scrapeCuraleafDirectory() { console.log('[DirectoryMatcher] Using hardcoded Curaleaf AZ locations (age-gate blocks simple fetch)...'); // Hardcoded Arizona Curaleaf locations from public knowledge // These would be scraped via Playwright in production return [ { name: 'Curaleaf Phoenix Camelback', city: 'Phoenix', state: 'AZ', address: '4811 E Camelback Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-camelback' }, { name: 'Curaleaf Phoenix Midtown', city: 'Phoenix', state: 'AZ', address: '1928 E Highland Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-midtown' }, { name: 'Curaleaf Glendale East', city: 'Glendale', state: 'AZ', address: '5150 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-east' }, { name: 'Curaleaf Glendale West', city: 'Glendale', state: 'AZ', address: '6501 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-west' }, { name: 'Curaleaf Gilbert', city: 'Gilbert', state: 'AZ', address: '1736 E Williams Field Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-gilbert' }, { name: 'Curaleaf Mesa', city: 'Mesa', state: 'AZ', address: '1540 S Power Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-mesa' }, { name: 'Curaleaf Tempe', city: 'Tempe', state: 'AZ', address: '1815 E Broadway Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tempe' }, { name: 'Curaleaf Scottsdale', city: 'Scottsdale', state: 'AZ', address: '8904 E Indian Bend Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-scottsdale' }, { name: 'Curaleaf Tucson Prince', city: 'Tucson', state: 'AZ', address: '3955 W Prince Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-prince' }, { name: 'Curaleaf Tucson Midvale', city: 'Tucson', state: 'AZ', address: '2936 N Midvale Park Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-midvale' }, { name: 'Curaleaf Sedona', city: 'Sedona', state: 'AZ', address: '525 AZ-179', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-sedona' }, { name: 'Curaleaf Youngtown', city: 'Youngtown', state: 'AZ', address: '11125 W Grand Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-youngtown' }, ]; } /** * Match a directory store to an existing dispensary */ function matchStoreToDispensary(store, dispensaries) { const normalizedStoreName = normalizeForComparison(store.name); const normalizedStoreCity = normalizeCity(store.city); let bestMatch = null; let bestScore = 0; let matchReason = ''; for (const disp of dispensaries) { const normalizedDispName = normalizeForComparison(disp.name); const normalizedDispCity = normalizeCity(disp.city || ''); let score = 0; const reasons = []; // 1. Name similarity (max 50 points) const nameSimilarity = stringSimilarity(normalizedStoreName, normalizedDispName); score += nameSimilarity * 50; if (nameSimilarity > 0.8) reasons.push(`name_match(${(nameSimilarity * 100).toFixed(0)}%)`); // 2. City match (25 points for exact, 15 for partial) if (normalizedStoreCity && normalizedDispCity) { if (normalizedStoreCity === normalizedDispCity) { score += 25; reasons.push('city_exact'); } else if (normalizedStoreCity.includes(normalizedDispCity) || normalizedDispCity.includes(normalizedStoreCity)) { score += 15; reasons.push('city_partial'); } } // 3. Address contains street name (15 points) if (store.address && disp.address) { const storeStreet = store.address.toLowerCase().split(/\s+/).slice(1, 4).join(' '); const dispStreet = disp.address.toLowerCase().split(/\s+/).slice(1, 4).join(' '); if (storeStreet && dispStreet && stringSimilarity(storeStreet, dispStreet) > 0.7) { score += 15; reasons.push('address_match'); } } // 4. Brand name in dispensary name (10 points) const brandName = store.name.split(' ')[0].toLowerCase(); // e.g., "Curaleaf", "Sol" if (disp.name.toLowerCase().includes(brandName)) { score += 10; reasons.push('brand_match'); } if (score > bestScore) { bestScore = score; bestMatch = disp; matchReason = reasons.join(', '); } } // Determine confidence level let confidence; if (bestScore >= 70) { confidence = 'high'; } else if (bestScore >= 50) { confidence = 'medium'; } else if (bestScore >= 30) { confidence = 'low'; } else { confidence = 'none'; } return { directoryStore: store, dispensaryId: bestMatch?.id || null, dispensaryName: bestMatch?.name || null, confidence, matchReason: matchReason || 'no_match', }; } // ============================================================ // MAIN FUNCTIONS // ============================================================ /** * Run directory matching for a provider and update database * Only applies high-confidence matches automatically */ async function matchDirectoryToDispensaries(provider, dryRun = true) { console.log(`[DirectoryMatcher] Running ${provider} directory matching (dryRun=${dryRun})...`); // Get directory stores let directoryStores; if (provider === 'curaleaf') { directoryStores = await scrapeCuraleafDirectory(); } else if (provider === 'sol') { directoryStores = await scrapeSolDirectory(); } else { throw new Error(`Unknown provider: ${provider}`); } // Get all AZ dispensaries from database const { rows: dispensaries } = await (0, connection_1.query)(`SELECT id, name, city, state, address, menu_type, menu_url, website FROM dispensaries WHERE state = 'AZ'`); console.log(`[DirectoryMatcher] Matching ${directoryStores.length} directory stores against ${dispensaries.length} dispensaries`); // Match each directory store const results = []; for (const store of directoryStores) { const match = matchStoreToDispensary(store, dispensaries); results.push(match); // Only apply high-confidence matches if not dry run if (!dryRun && match.confidence === 'high' && match.dispensaryId) { await applyDirectoryMatch(match.dispensaryId, provider, store); } } // Count results const report = { provider, totalDirectoryStores: directoryStores.length, highConfidenceMatches: results.filter((r) => r.confidence === 'high').length, mediumConfidenceMatches: results.filter((r) => r.confidence === 'medium').length, lowConfidenceMatches: results.filter((r) => r.confidence === 'low').length, unmatched: results.filter((r) => r.confidence === 'none').length, results, }; console.log(`[DirectoryMatcher] ${provider} matching complete:`); console.log(` - High confidence: ${report.highConfidenceMatches}`); console.log(` - Medium confidence: ${report.mediumConfidenceMatches}`); console.log(` - Low confidence: ${report.lowConfidenceMatches}`); console.log(` - Unmatched: ${report.unmatched}`); return report; } /** * Apply a directory match to a dispensary */ async function applyDirectoryMatch(dispensaryId, provider, store) { console.log(`[DirectoryMatcher] Applying match: dispensary ${dispensaryId} -> ${store.storeUrl}`); await (0, connection_1.query)(` UPDATE dispensaries SET menu_type = $1, menu_url = $2, platform_dispensary_id = NULL, provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) || jsonb_build_object( 'detected_provider', $1::text, 'detection_method', 'directory_match'::text, 'detected_at', NOW(), 'directory_store_name', $3::text, 'directory_store_url', $2::text, 'directory_store_city', $4::text, 'directory_store_address', $5::text, 'not_crawlable', true, 'not_crawlable_reason', $6::text ), updated_at = NOW() WHERE id = $7 `, [ provider, store.storeUrl, store.name, store.city, store.address, `${provider} proprietary menu - no crawler available`, dispensaryId, ]); } /** * Preview matches without applying them */ async function previewDirectoryMatches(provider) { return matchDirectoryToDispensaries(provider, true); } /** * Apply high-confidence matches */ async function applyHighConfidenceMatches(provider) { return matchDirectoryToDispensaries(provider, false); }