fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
380
backend/dist/dutchie-az/services/directory-matcher.js
vendored
Normal file
380
backend/dist/dutchie-az/services/directory-matcher.js
vendored
Normal file
@@ -0,0 +1,380 @@
|
||||
"use strict";
|
||||
/**
|
||||
* Directory-Based Store Matcher
|
||||
*
|
||||
* Scrapes provider directory pages (Curaleaf, Sol, etc.) to get store lists,
|
||||
* then matches them to existing dispensaries by fuzzy name/city/address matching.
|
||||
*
|
||||
* This allows us to:
|
||||
* 1. Find specific store URLs for directory-style websites
|
||||
* 2. Match stores confidently by name+city
|
||||
* 3. Mark non-Dutchie providers as not_crawlable until we build crawlers
|
||||
*/
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
exports.scrapeSolDirectory = scrapeSolDirectory;
|
||||
exports.scrapeCuraleafDirectory = scrapeCuraleafDirectory;
|
||||
exports.matchDirectoryToDispensaries = matchDirectoryToDispensaries;
|
||||
exports.previewDirectoryMatches = previewDirectoryMatches;
|
||||
exports.applyHighConfidenceMatches = applyHighConfidenceMatches;
|
||||
const connection_1 = require("../db/connection");
|
||||
// ============================================================
|
||||
// NORMALIZATION FUNCTIONS
|
||||
// ============================================================
|
||||
/**
 * Normalize a store/dispensary name for fuzzy comparison.
 *
 * Steps, in order:
 *  - Lowercase
 *  - Replace punctuation with spaces
 *  - Collapse whitespace and trim
 *  - Remove common descriptor words (dispensary, cannabis, etc.)
 *
 * Punctuation is stripped before the descriptor words so that a descriptor
 * followed by punctuation ("Dispensary,") is still recognised. The right-hand
 * boundary is a lookahead rather than a consumed character, so consecutive
 * descriptors ("Cannabis Dispensary") are all removed. A descriptor in the
 * leading position is kept because the pattern requires preceding whitespace.
 *
 * @param {string|null|undefined} str - Raw name.
 * @returns {string} The normalized name, or '' for falsy input.
 */
function normalizeForComparison(str) {
    if (!str)
        return '';
    return str
        .toLowerCase()
        .replace(/[^\w\s]/g, ' ') // Remove punctuation
        .replace(/\s+/g, ' ') // Collapse whitespace
        .trim()
        .replace(/\s+(?:dispensary|cannabis|marijuana|medical|recreational|shop|store|flower|wellness)(?=\s|$)/g, '');
}
|
||||
/**
 * Normalize a city name for comparison.
 *
 * Lowercases, deletes punctuation (so "St. Johns" becomes "st johns"),
 * collapses runs of whitespace to a single space and trims, so that
 * differently spaced spellings of the same city compare equal.
 *
 * @param {string|null|undefined} city - Raw city name.
 * @returns {string} The normalized city, or '' for falsy input.
 */
function normalizeCity(city) {
    if (!city)
        return '';
    return city
        .toLowerCase()
        .replace(/[^\w\s]/g, '')
        .replace(/\s+/g, ' ')
        .trim();
}
|
||||
/**
 * Calculate the similarity between two strings as a value in [0, 1].
 *
 * The score is the Levenshtein distance normalized by the length of the
 * longer string: 1 means identical, 0 means nothing in common.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} 0 when either input is empty/falsy, 1 when the inputs are
 *   identical, otherwise (maxLength - distance) / maxLength.
 */
function stringSimilarity(a, b) {
    if (!a || !b)
        return 0;
    if (a === b)
        return 1;
    // Both strings are non-empty here, so maxLength is always >= 1.
    const maxLength = Math.max(a.length, b.length);
    return (maxLength - levenshteinDistance(a, b)) / maxLength;
}
|
||||
/**
 * Levenshtein (edit) distance between two strings.
 *
 * Counts the minimum number of single-character insertions, deletions and
 * substitutions needed to turn one string into the other. Only the previous
 * and current rows of the dynamic-programming table are kept, so memory use
 * is proportional to the length of `a` instead of `a.length * b.length`.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} The edit distance (0 for identical strings).
 */
function levenshteinDistance(a, b) {
    // Row 0: turning the empty prefix of `b` into each prefix of `a`.
    let previousRow = Array.from({ length: a.length + 1 }, (_, j) => j);
    for (let i = 1; i <= b.length; i++) {
        const currentRow = [i];
        for (let j = 1; j <= a.length; j++) {
            const substitutionCost = b.charAt(i - 1) === a.charAt(j - 1) ? 0 : 1;
            currentRow[j] = Math.min(previousRow[j - 1] + substitutionCost, // match / substitution
            currentRow[j - 1] + 1, // insertion
            previousRow[j] + 1 // deletion
            );
        }
        previousRow = currentRow;
    }
    return previousRow[a.length];
}
|
||||
/**
 * Check whether one string contains another after both have been normalized
 * with normalizeForComparison.
 *
 * A needle that normalizes to the empty string (null, '', punctuation only)
 * never matches; without this guard every haystack would "contain" it.
 *
 * @param {string|null|undefined} haystack - Text to search in.
 * @param {string|null|undefined} needle - Text to search for.
 * @returns {boolean} True when the normalized needle is non-empty and occurs
 *   in the normalized haystack.
 */
function containsNormalized(haystack, needle) {
    const normalizedNeedle = normalizeForComparison(needle);
    if (!normalizedNeedle)
        return false;
    return normalizeForComparison(haystack).includes(normalizedNeedle);
}
|
||||
// ============================================================
|
||||
// PROVIDER DIRECTORY SCRAPERS
|
||||
// ============================================================
|
||||
/**
 * City detectors for Sol Flower listings, checked in order; the first pattern
 * found in a listing's name or address wins.
 */
const SOL_CITY_PATTERNS = [
    { pattern: /phoenix/i, city: 'Phoenix' },
    { pattern: /scottsdale/i, city: 'Scottsdale' },
    { pattern: /tempe/i, city: 'Tempe' },
    { pattern: /tucson/i, city: 'Tucson' },
    { pattern: /mesa/i, city: 'Mesa' },
    { pattern: /sun city/i, city: 'Sun City' },
    { pattern: /glendale/i, city: 'Glendale' },
];
/**
 * Hardcoded Sol Flower locations, used when the live directory cannot be
 * fetched or parsed. A fresh array is built on every call so callers may
 * mutate the result freely.
 *
 * @returns {Array<{name: string, city: string, state: string, address: string, storeUrl: string}>}
 */
function getSolFallbackLocations() {
    return [
        // NOTE(review): the "32nd & Shea" entry points at the deer-valley URL - confirm this is intended.
        { name: 'Sol Flower 32nd & Shea', city: 'Phoenix', state: 'AZ', address: '3217 E Shea Blvd Suite 1 A', storeUrl: 'https://www.livewithsol.com/locations/deer-valley/' },
        { name: 'Sol Flower Scottsdale Airpark', city: 'Scottsdale', state: 'AZ', address: '14980 N 78th Way Ste 204', storeUrl: 'https://www.livewithsol.com/locations/scottsdale-airpark/' },
        { name: 'Sol Flower Sun City', city: 'Sun City', state: 'AZ', address: '13650 N 99th Ave', storeUrl: 'https://www.livewithsol.com/locations/sun-city/' },
        { name: 'Sol Flower Tempe McClintock', city: 'Tempe', state: 'AZ', address: '1322 N McClintock Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-mcclintock/' },
        { name: 'Sol Flower Tempe University', city: 'Tempe', state: 'AZ', address: '2424 W University Dr', storeUrl: 'https://www.livewithsol.com/locations/tempe-university/' },
        { name: 'Sol Flower Foothills Tucson', city: 'Tucson', state: 'AZ', address: '6026 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/foothills-tucson/' },
        { name: 'Sol Flower South Tucson', city: 'Tucson', state: 'AZ', address: '3000 W Valencia Rd Ste 210', storeUrl: 'https://www.livewithsol.com/locations/south-tucson/' },
        { name: 'Sol Flower North Tucson', city: 'Tucson', state: 'AZ', address: '4837 N 1st Ave', storeUrl: 'https://www.livewithsol.com/locations/north-tucson/' },
        { name: 'Sol Flower Casas Adobes', city: 'Tucson', state: 'AZ', address: '6437 N Oracle Rd', storeUrl: 'https://www.livewithsol.com/locations/casas-adobes/' },
    ];
}
/**
 * Pick the city for a Sol Flower listing from its name or address text.
 *
 * @param {string} name - Listing name.
 * @param {string} address - Listing address text.
 * @returns {string} The first matching city, or 'Unknown' when none matches.
 */
function detectSolCity(name, address) {
    const hit = SOL_CITY_PATTERNS.find(({ pattern }) => pattern.test(name) || pattern.test(address));
    return hit ? hit.city : 'Unknown';
}
/**
 * Sol Flower (livewithsol.com) - Static HTML, easy to scrape
 *
 * Fetches the locations page and extracts each location link together with
 * the first street-address-looking text that follows it. Falls back to the
 * hardcoded location list when the request fails, the response is not OK, or
 * the pattern finds nothing, so this function does not reject on scrape errors.
 *
 * @returns {Promise<Array<{name: string, city: string, state: string, address: string, storeUrl: string}>>}
 */
async function scrapeSolDirectory() {
    console.log('[DirectoryMatcher] Scraping Sol Flower directory...');
    try {
        const response = await fetch('https://www.livewithsol.com/locations/', {
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                Accept: 'text/html',
            },
        });
        if (!response.ok) {
            throw new Error(`HTTP ${response.status}`);
        }
        const html = await response.text();
        // Pattern to find location cards
        // Format: <a href="/locations/slug/">NAME</a> with address nearby
        const locationRegex = /<a[^>]+href="(\/locations\/[^"]+)"[^>]*>([^<]+)<\/a>[\s\S]*?(\d+[^<]+(?:Ave|St|Blvd|Dr|Rd|Way)[^<]*)/gi;
        const stores = [];
        for (const [, path, name, address] of html.matchAll(locationRegex)) {
            stores.push({
                name: name.trim(),
                city: detectSolCity(name, address),
                state: 'AZ',
                address: address.trim(),
                storeUrl: `https://www.livewithsol.com${path}`,
            });
        }
        // If regex didn't work, use known hardcoded values (fallback)
        if (stores.length === 0) {
            console.log('[DirectoryMatcher] Using hardcoded Sol locations');
            return getSolFallbackLocations();
        }
        console.log(`[DirectoryMatcher] Found ${stores.length} Sol Flower locations`);
        return stores;
    }
    catch (error) {
        // A rejection value is not guaranteed to be an Error (it may even be
        // null), so read the message defensively instead of throwing here.
        console.error('[DirectoryMatcher] Error scraping Sol directory:', error?.message ?? error);
        return getSolFallbackLocations();
    }
}
|
||||
/**
 * Curaleaf - Has age-gate, so we need hardcoded AZ locations
 * In production, this would use Playwright to bypass age-gate
 *
 * No network request is made; the function is async only so that it has the
 * same calling convention as the other directory scrapers. A new array is
 * built on every call.
 *
 * @returns {Promise<Array<{name: string, city: string, state: string, address: string, storeUrl: string}>>}
 */
async function scrapeCuraleafDirectory() {
    console.log('[DirectoryMatcher] Using hardcoded Curaleaf AZ locations (age-gate blocks simple fetch)...');
    // Hardcoded Arizona Curaleaf locations from public knowledge
    // These would be scraped via Playwright in production
    return [
        { name: 'Curaleaf Phoenix Camelback', city: 'Phoenix', state: 'AZ', address: '4811 E Camelback Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-camelback' },
        { name: 'Curaleaf Phoenix Midtown', city: 'Phoenix', state: 'AZ', address: '1928 E Highland Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-phoenix-midtown' },
        { name: 'Curaleaf Glendale East', city: 'Glendale', state: 'AZ', address: '5150 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-east' },
        { name: 'Curaleaf Glendale West', city: 'Glendale', state: 'AZ', address: '6501 W Glendale Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-glendale-west' },
        { name: 'Curaleaf Gilbert', city: 'Gilbert', state: 'AZ', address: '1736 E Williams Field Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-gilbert' },
        { name: 'Curaleaf Mesa', city: 'Mesa', state: 'AZ', address: '1540 S Power Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-mesa' },
        { name: 'Curaleaf Tempe', city: 'Tempe', state: 'AZ', address: '1815 E Broadway Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tempe' },
        { name: 'Curaleaf Scottsdale', city: 'Scottsdale', state: 'AZ', address: '8904 E Indian Bend Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-scottsdale' },
        { name: 'Curaleaf Tucson Prince', city: 'Tucson', state: 'AZ', address: '3955 W Prince Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-prince' },
        { name: 'Curaleaf Tucson Midvale', city: 'Tucson', state: 'AZ', address: '2936 N Midvale Park Rd', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-tucson-midvale' },
        { name: 'Curaleaf Sedona', city: 'Sedona', state: 'AZ', address: '525 AZ-179', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-sedona' },
        { name: 'Curaleaf Youngtown', city: 'Youngtown', state: 'AZ', address: '11125 W Grand Ave', storeUrl: 'https://curaleaf.com/stores/curaleaf-az-youngtown' },
    ];
}
|
||||
/**
 * Match a directory store to an existing dispensary
 *
 * Every dispensary is scored against the store and the highest score wins
 * (the first one wins ties). Scoring:
 *  - name similarity: up to 50 points
 *  - city: 25 points for an exact match, 15 when one contains the other
 *  - address: 15 points when words 2-4 of both addresses are > 70% similar
 *  - brand: 10 points when the store's first word occurs in the dispensary name
 *
 * Confidence is 'high' at >= 70 points, 'medium' at >= 50, 'low' at >= 30 and
 * 'none' below that. The best-scoring dispensary is still reported when the
 * confidence is 'none', as long as it scored above 0.
 *
 * @param {{name: string, city: string, address?: string}} store - Directory entry.
 * @param {Array<{id: *, name: ?string, city: ?string, address: ?string}>} dispensaries - Candidates.
 * @returns {{directoryStore: Object, dispensaryId: *, dispensaryName: ?string, confidence: string, matchReason: string}}
 */
function matchStoreToDispensary(store, dispensaries) {
    // Words 2-4 of an address: skips the house number and keeps the street name.
    const streetOf = (address) => address.toLowerCase().split(/\s+/).slice(1, 4).join(' ');
    // Store-side values do not change per candidate, so compute them once.
    const normalizedStoreName = normalizeForComparison(store.name);
    const normalizedStoreCity = normalizeCity(store.city);
    const storeStreet = store.address ? streetOf(store.address) : '';
    const brandName = (store.name || '').trim().split(/\s+/)[0].toLowerCase(); // e.g., "Curaleaf", "Sol"
    let bestMatch = null;
    let bestScore = 0;
    let matchReason = '';
    for (const disp of dispensaries) {
        // A dispensary row may have no name; treat it as empty instead of throwing.
        const dispName = disp.name || '';
        const normalizedDispName = normalizeForComparison(dispName);
        const normalizedDispCity = normalizeCity(disp.city || '');
        let score = 0;
        const reasons = [];
        // 1. Name similarity (max 50 points)
        const nameSimilarity = stringSimilarity(normalizedStoreName, normalizedDispName);
        score += nameSimilarity * 50;
        if (nameSimilarity > 0.8)
            reasons.push(`name_match(${(nameSimilarity * 100).toFixed(0)}%)`);
        // 2. City match (25 points for exact, 15 for partial)
        if (normalizedStoreCity && normalizedDispCity) {
            if (normalizedStoreCity === normalizedDispCity) {
                score += 25;
                reasons.push('city_exact');
            }
            else if (normalizedStoreCity.includes(normalizedDispCity) ||
                normalizedDispCity.includes(normalizedStoreCity)) {
                score += 15;
                reasons.push('city_partial');
            }
        }
        // 3. Address contains street name (15 points)
        if (storeStreet && disp.address) {
            const dispStreet = streetOf(disp.address);
            if (dispStreet && stringSimilarity(storeStreet, dispStreet) > 0.7) {
                score += 15;
                reasons.push('address_match');
            }
        }
        // 4. Brand name in dispensary name (10 points)
        // An empty brand would be "contained" in every name, so require one.
        if (brandName && dispName.toLowerCase().includes(brandName)) {
            score += 10;
            reasons.push('brand_match');
        }
        if (score > bestScore) {
            bestScore = score;
            bestMatch = disp;
            matchReason = reasons.join(', ');
        }
    }
    // Determine confidence level
    let confidence;
    if (bestScore >= 70) {
        confidence = 'high';
    }
    else if (bestScore >= 50) {
        confidence = 'medium';
    }
    else if (bestScore >= 30) {
        confidence = 'low';
    }
    else {
        confidence = 'none';
    }
    return {
        directoryStore: store,
        // `??` keeps a legitimate id of 0; only a missing id becomes null.
        dispensaryId: bestMatch?.id ?? null,
        dispensaryName: bestMatch?.name || null,
        confidence,
        matchReason: matchReason || 'no_match',
    };
}
|
||||
// ============================================================
|
||||
// MAIN FUNCTIONS
|
||||
// ============================================================
|
||||
/**
 * Run directory matching for a provider and update database
 * Only applies high-confidence matches automatically
 *
 * Loads the provider's directory stores, loads every dispensary with
 * state = 'AZ', matches each store against them, and (unless dryRun) writes
 * high-confidence matches back via applyDirectoryMatch.
 *
 * A dispensary row holds a single menu URL, so when several directory stores
 * resolve to the same dispensary only the first one is written; later ones
 * are skipped with a warning instead of silently overwriting it. They are
 * still included in the returned report.
 *
 * @param {string} provider - 'curaleaf' or 'sol'.
 * @param {boolean} [dryRun=true] - When true, nothing is written.
 * @returns {Promise<{provider: string, totalDirectoryStores: number, highConfidenceMatches: number, mediumConfidenceMatches: number, lowConfidenceMatches: number, unmatched: number, results: Array<Object>}>}
 * @throws {Error} When the provider is not recognised.
 */
async function matchDirectoryToDispensaries(provider, dryRun = true) {
    console.log(`[DirectoryMatcher] Running ${provider} directory matching (dryRun=${dryRun})...`);
    // Get directory stores
    let directoryStores;
    if (provider === 'curaleaf') {
        directoryStores = await scrapeCuraleafDirectory();
    }
    else if (provider === 'sol') {
        directoryStores = await scrapeSolDirectory();
    }
    else {
        throw new Error(`Unknown provider: ${provider}`);
    }
    // Get all AZ dispensaries from database
    const { rows: dispensaries } = await (0, connection_1.query)(`SELECT id, name, city, state, address, menu_type, menu_url, website
       FROM dispensaries
       WHERE state = 'AZ'`);
    console.log(`[DirectoryMatcher] Matching ${directoryStores.length} directory stores against ${dispensaries.length} dispensaries`);
    // Match each directory store, tallying confidence levels as we go
    const results = [];
    const counts = { high: 0, medium: 0, low: 0, none: 0 };
    const appliedDispensaryIds = new Set();
    for (const store of directoryStores) {
        const match = matchStoreToDispensary(store, dispensaries);
        results.push(match);
        if (Object.hasOwn(counts, match.confidence)) {
            counts[match.confidence] += 1;
        }
        // Only apply high-confidence matches if not dry run
        if (dryRun || match.confidence !== 'high' || !match.dispensaryId) {
            continue;
        }
        if (appliedDispensaryIds.has(match.dispensaryId)) {
            console.warn(`[DirectoryMatcher] Skipping ${store.storeUrl}: dispensary ${match.dispensaryId} was already matched in this run`);
            continue;
        }
        // Writes are awaited one at a time, in directory order.
        await applyDirectoryMatch(match.dispensaryId, provider, store);
        appliedDispensaryIds.add(match.dispensaryId);
    }
    // Count results
    const report = {
        provider,
        totalDirectoryStores: directoryStores.length,
        highConfidenceMatches: counts.high,
        mediumConfidenceMatches: counts.medium,
        lowConfidenceMatches: counts.low,
        unmatched: counts.none,
        results,
    };
    console.log(`[DirectoryMatcher] ${provider} matching complete:`);
    console.log(` - High confidence: ${report.highConfidenceMatches}`);
    console.log(` - Medium confidence: ${report.mediumConfidenceMatches}`);
    console.log(` - Low confidence: ${report.lowConfidenceMatches}`);
    console.log(` - Unmatched: ${report.unmatched}`);
    return report;
}
|
||||
/**
 * Apply a directory match to a dispensary
 *
 * Issues one UPDATE on the dispensaries row with the given id that:
 *  - sets menu_type to the provider and menu_url to the store URL
 *  - clears platform_dispensary_id
 *  - merges the directory details into provider_detection_data (existing
 *    keys are kept unless overwritten) and flags the row as not_crawlable
 *  - bumps updated_at
 *
 * All values are passed as query parameters, never interpolated into the SQL.
 *
 * @param {*} dispensaryId - id of the dispensaries row to update.
 * @param {string} provider - Provider key, stored as menu_type.
 * @param {{name: string, city: string, address: string, storeUrl: string}} store - Matched directory entry.
 * @returns {Promise<void>}
 */
async function applyDirectoryMatch(dispensaryId, provider, store) {
    console.log(`[DirectoryMatcher] Applying match: dispensary ${dispensaryId} -> ${store.storeUrl}`);
    await (0, connection_1.query)(`
    UPDATE dispensaries SET
      menu_type = $1,
      menu_url = $2,
      platform_dispensary_id = NULL,
      provider_detection_data = COALESCE(provider_detection_data, '{}'::jsonb) ||
        jsonb_build_object(
          'detected_provider', $1::text,
          'detection_method', 'directory_match'::text,
          'detected_at', NOW(),
          'directory_store_name', $3::text,
          'directory_store_url', $2::text,
          'directory_store_city', $4::text,
          'directory_store_address', $5::text,
          'not_crawlable', true,
          'not_crawlable_reason', $6::text
        ),
      updated_at = NOW()
    WHERE id = $7
    `, [
        provider,
        store.storeUrl,
        store.name,
        store.city,
        store.address,
        `${provider} proprietary menu - no crawler available`,
        dispensaryId,
    ]);
}
|
||||
/**
 * Preview matches without applying them
 *
 * Runs matchDirectoryToDispensaries with dryRun = true.
 *
 * @param {string} provider - Provider key, passed through unchanged.
 * @returns {Promise<Object>} The report from matchDirectoryToDispensaries.
 */
async function previewDirectoryMatches(provider) {
    return matchDirectoryToDispensaries(provider, true);
}
|
||||
/**
 * Apply high-confidence matches
 *
 * Runs matchDirectoryToDispensaries with dryRun = false, so matches are
 * written to the database.
 *
 * @param {string} provider - Provider key, passed through unchanged.
 * @returns {Promise<Object>} The report from matchDirectoryToDispensaries.
 */
async function applyHighConfidenceMatches(provider) {
    return matchDirectoryToDispensaries(provider, false);
}
|
||||
Reference in New Issue
Block a user