The job_run_logs table tracks scheduled job orchestration, not individual worker jobs. Worker info (worker_id, worker_hostname) belongs on dispensary_crawl_jobs, not job_run_logs. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
182 lines
5.8 KiB
JavaScript
182 lines
5.8 KiB
JavaScript
"use strict";
|
|
/**
 * Product Normalizer Utility
 *
 * Functions for normalizing product data to enable consistent matching
 * and prevent duplicate product entries.
 */
|
|
// CommonJS export preamble. Defines a non-enumerable `__esModule: true` flag on
// `exports` (conventionally read by ES-module interop helpers; presumably this
// file is transpiler output — confirm against the build setup).
Object.defineProperty(exports, "__esModule", { value: true });
|
|
// Public API of this module. Each assignment relies on function-declaration
// hoisting: the functions are defined further down in the file.
// Note: levenshteinDistance is intentionally absent here, so it stays
// module-private.
exports.normalizeProductName = normalizeProductName;
|
|
exports.normalizeBrandName = normalizeBrandName;
|
|
exports.normalizeWeight = normalizeWeight;
|
|
exports.generateProductFingerprint = generateProductFingerprint;
|
|
exports.stringSimilarity = stringSimilarity;
|
|
exports.areProductsSimilar = areProductsSimilar;
|
|
/**
 * Normalize a product name for matching.
 *
 * Steps, in order:
 * - Lowercase
 * - Remove weight/size tokens (e.g. "3.5g", "100 mg", "1 oz")
 * - Remove cannabinoid potency tokens (e.g. "THC: 25.5%", "cbd=10")
 * - Remove emoji
 * - Replace remaining punctuation with spaces
 * - Remove "special offer" markers
 * - Collapse whitespace and trim
 *
 * The weight and potency patterns must run BEFORE punctuation is stripped:
 * they depend on '.', ':', '=' and '%' still being present. Stripping first
 * turned "thc: 25.5% 3.5g" into "thc  25 5  3 5g", which left stray digits
 * fused to the name ("blue dream5 3").
 *
 * @param {string} name - Raw product name. Any falsy value yields ''.
 * @returns {string} The normalized name (may be '' if nothing remains).
 */
function normalizeProductName(name) {
    if (!name)
        return '';
    return name
        .toLowerCase()
        // Weight/size first, so "cbd 1000mg" loses the whole "1000mg" token
        // instead of the potency pattern eating "cbd 1000" and leaving "mg".
        // Replacing with a space (not '') keeps neighbouring words apart.
        .replace(/\s*\d+(\.\d+)?\s*(mg|g|oz|ml|gram|grams|ounce|ounces)\b/gi, ' ')
        // Potency such as "thc: 25.5%" or "cbd 10".
        .replace(/\s*(thc|cbd|cbg|cbn|tac)\s*[:=]?\s*[\d.]+\s*%?/gi, ' ')
        // Emoji in the U+1F300..U+1F9FF range; anything outside that range is
        // still removed by the punctuation pass below.
        .replace(/[\u{1F300}-\u{1F9FF}]/gu, '')
        // Everything that is not [A-Za-z0-9_] or whitespace becomes a space.
        .replace(/[^\w\s]/g, ' ')
        // "special offer" style markers.
        .replace(/\s*special\s*offer\s*/gi, ' ')
        // Collapse runs of whitespace introduced by the replacements above.
        .replace(/\s+/g, ' ')
        .trim();
}
|
|
/**
 * Normalize a brand name for matching.
 *
 * Lowercases the input, turns every character that is not a word character
 * or whitespace into a space, then collapses whitespace runs and trims.
 *
 * @param {string} brand - Raw brand name. Any falsy value yields ''.
 * @returns {string} The normalized brand name.
 */
function normalizeBrandName(brand) {
    if (!brand) {
        return '';
    }
    const lowered = brand.toLowerCase().trim();
    // Punctuation becomes a space rather than vanishing, so "dr.green"
    // normalizes to "dr green" and not "drgreen".
    const withoutPunctuation = lowered.replace(/[^\w\s]/g, ' ');
    const collapsed = withoutPunctuation.replace(/\s+/g, ' ');
    return collapsed.trim();
}
|
|
/**
 * Normalize a weight string to a compact canonical form so that equal
 * weights compare equal as strings.
 *
 * Examples: "3.5 grams" -> "3.5g", "1/8 oz" -> "3.5g", "1 ounce" -> "28g",
 * "0.5 oz" -> "14g", "1/2g" -> "0.5g", "100mg" -> "100mg".
 *
 * Rules, first match wins:
 * 1. A fraction word/token (1/8 | eighth, 1/4 | quarter, 1/2 | half) is a
 *    fraction of an ounce, unless it is immediately followed by a gram unit
 *    ("1/2g", "half gram"), in which case it is a fraction of a gram.
 * 2. A bare "oz" or a standalone "1 oz" / "1oz" is one ounce.
 * 3. Otherwise the first number and optional unit are extracted; a missing
 *    unit defaults to grams, and ounces are converted to grams.
 *
 * Ounces are converted at 28 g per ounce (the rounded figure this module uses
 * for its 3.5g / 7g / 14g / 28g shortcuts), rounded to one decimal place and
 * printed without a trailing ".0" so that "1 ounce" and "1 oz" both give "28g".
 *
 * @param {string} weight - Raw weight text. Any falsy value yields ''.
 * @returns {string} Canonical weight, or the lowercased/trimmed input when no
 *   usable number can be extracted.
 */
function normalizeWeight(weight) {
    if (!weight)
        return '';
    const GRAMS_PER_OUNCE = 28;
    const w = weight.toLowerCase().trim();
    // Checked in this order to keep the original precedence (1/8, 1/4, 1/2).
    const fractions = [
        { pattern: /1\/8|eighth/, fraction: 0.125 },
        { pattern: /1\/4|quarter/, fraction: 0.25 },
        { pattern: /1\/2|half/, fraction: 0.5 },
    ];
    for (const { pattern, fraction } of fractions) {
        const hit = pattern.exec(w);
        if (!hit)
            continue;
        // Only a gram unit directly after the fraction makes it a gram
        // fraction; "1/8 oz (3.5g)" must still be read as an eighth ounce.
        const afterFraction = w.slice(hit.index + hit[0].length);
        if (/^\s*(grams?|g)\b/.test(afterFraction)) {
            return `${fraction}g`;
        }
        return `${fraction * GRAMS_PER_OUNCE}g`;
    }
    // Whole ounce. The leading guard stops "0.1 oz" or "11 oz" from being
    // mistaken for "1 oz".
    if (w === 'oz' || /(^|[^\d.])1\s*oz\b/.test(w)) {
        return `${GRAMS_PER_OUNCE}g`;
    }
    // Extract numeric value and unit
    const match = w.match(/([\d.]+)\s*(mg|g|oz|ml|gram|grams?|ounce|ounces?)?/i);
    if (!match)
        return w;
    const value = parseFloat(match[1]);
    // "[\d.]+" also matches a lone "."; do not emit "NaNg" for it.
    if (Number.isNaN(value))
        return w;
    let unit = (match[2] || 'g').toLowerCase();
    // Normalize unit names
    unit = unit.replace(/gram(s)?/, 'g').replace(/ounce(s)?/, 'oz');
    // Convert oz to grams for consistency
    if (unit === 'oz') {
        // Number(...) drops the trailing ".0" that toFixed(1) would leave.
        return `${Number((value * GRAMS_PER_OUNCE).toFixed(1))}g`;
    }
    return `${value}${unit}`;
}
|
|
/**
 * Build a matching fingerprint for a product, used for deduplication.
 *
 * The fingerprint is the normalized name, normalized brand, normalized weight
 * and stringified category id joined with '|'. Components that come out empty
 * are left out entirely, so a component's position in the result is not fixed.
 *
 * @param {string} name - Raw product name.
 * @param {string} brand - Raw brand name.
 * @param {string} weight - Raw weight text.
 * @param {*} categoryId - Category identifier; null/undefined is treated as
 *   absent, anything else is converted with toString().
 * @returns {string} The '|'-joined fingerprint ('' if every component is empty).
 */
function generateProductFingerprint(name, brand, weight, categoryId) {
    const namePart = normalizeProductName(name);
    const brandPart = normalizeBrandName(brand);
    const weightPart = normalizeWeight(weight);
    const categoryPart = categoryId?.toString() || '';
    let fingerprint = '';
    for (const part of [namePart, brandPart, weightPart, categoryPart]) {
        if (!part) {
            continue;
        }
        fingerprint = fingerprint ? `${fingerprint}|${part}` : part;
    }
    return fingerprint;
}
|
|
/**
 * Score how similar two strings are on a 0-100 scale.
 *
 * The comparison is case-insensitive and based on Levenshtein distance:
 * the score is the share of the longer string's length that does not need
 * editing, rounded to the nearest integer.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} 100 for identical inputs (including two empty strings),
 *   0 when exactly one side is empty/missing, otherwise the rounded score.
 */
function stringSimilarity(str1, str2) {
    if (str1 === str2) {
        return 100;
    }
    if (!(str1 && str2)) {
        return 0;
    }
    const a = str1.toLowerCase();
    const b = str2.toLowerCase();
    if (a === b) {
        return 100;
    }
    // On equal lengths the second string counts as the "longer" one.
    const [shorter, longer] = a.length > b.length ? [b, a] : [a, b];
    const longerLength = longer.length;
    // Defensive guard against division by zero; empty strings have already
    // been handled above.
    if (longerLength === 0) {
        return 100;
    }
    const distance = levenshteinDistance(longer, shorter);
    const unchanged = longerLength - distance;
    return Math.round((unchanged / longerLength) * 100);
}
|
|
/**
 * Levenshtein (edit) distance between two strings: the minimum number of
 * single-character insertions, deletions and substitutions needed to turn
 * str1 into str2. Characters are compared by string index.
 *
 * Only the previous row of the distance table is ever read, so two rolling
 * rows are kept instead of a full (m+1) x (n+1) matrix.
 *
 * @param {string} str1 - Source string.
 * @param {string} str2 - Target string.
 * @returns {number} The edit distance (0 when the strings are equal).
 */
function levenshteinDistance(str1, str2) {
    const m = str1.length;
    const n = str2.length;
    // Row 0: turning '' into the first j characters of str2 takes j insertions.
    let previous = Array.from({ length: n + 1 }, (_, j) => j);
    let current = new Array(n + 1).fill(0);
    for (let i = 1; i <= m; i++) {
        // Column 0: turning the first i characters of str1 into '' takes i deletions.
        current[0] = i;
        for (let j = 1; j <= n; j++) {
            const cost = str1[i - 1] === str2[j - 1] ? 0 : 1;
            current[j] = Math.min(
                previous[j] + 1, // deletion
                current[j - 1] + 1, // insertion
                previous[j - 1] + cost // substitution
            );
        }
        // The row just filled becomes "previous"; the old one is reused.
        [previous, current] = [current, previous];
    }
    return previous[n];
}
|
|
/**
 * Decide whether two products are likely the same item.
 *
 * The decision is driven by the similarity of the normalized names:
 * - name similarity >= threshold: similar;
 * - same (non-empty) normalized brand and similarity >= threshold - 10:
 *   similar, confidence boosted by 5;
 * - same (non-empty) normalized weight and similarity >= threshold - 15:
 *   similar, confidence boosted by 3;
 * - otherwise: not similar.
 *
 * If either name normalizes to '' there is nothing to compare, so the result
 * is "not similar" with confidence 0. Without this, two nameless products
 * would compare as identical with confidence 100.
 *
 * @param {{name: string, brand?: string, weight?: string}} product1
 * @param {{name: string, brand?: string, weight?: string}} product2
 * @param {number} [threshold=92] - Minimum name similarity (0-100) for an
 *   unconditional match.
 * @returns {{isSimilar: boolean, confidence: number}} The verdict and a
 *   confidence score, capped at 100.
 */
function areProductsSimilar(product1, product2, threshold = 92) {
    const name1 = normalizeProductName(product1.name);
    const name2 = normalizeProductName(product2.name);
    if (!name1 || !name2) {
        return { isSimilar: false, confidence: 0 };
    }
    const nameSimilarity = stringSimilarity(name1, name2);
    // If names are very similar, likely same product
    if (nameSimilarity >= threshold) {
        return { isSimilar: true, confidence: nameSimilarity };
    }
    // Keeps boosted scores inside the 0-100 range for high thresholds.
    const boosted = (bonus) => Math.min(100, nameSimilarity + bonus);
    // Same brand: accept a somewhat weaker name match
    const brand1 = normalizeBrandName(product1.brand);
    const brand2 = normalizeBrandName(product2.brand);
    if (brand1 && brand2 && brand1 === brand2 && nameSimilarity >= threshold - 10) {
        return { isSimilar: true, confidence: boosted(5) };
    }
    // Same weight: accept an even weaker name match, with a smaller boost
    const weight1 = normalizeWeight(product1.weight);
    const weight2 = normalizeWeight(product2.weight);
    if (weight1 && weight2 && weight1 === weight2 && nameSimilarity >= threshold - 15) {
        return { isSimilar: true, confidence: boosted(3) };
    }
    return { isSimilar: false, confidence: nameSimilarity };
}
|