Files
cannaiq/backend/dist/utils/product-normalizer.js
Kelly 66e07b2009 fix(monitor): remove non-existent worker columns from job_run_logs query
The job_run_logs table tracks scheduled job orchestration, not individual
worker jobs. Worker info (worker_id, worker_hostname) belongs on
dispensary_crawl_jobs, not job_run_logs.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 18:45:05 -07:00

182 lines
5.8 KiB
JavaScript

"use strict";
/**
* Product Normalizer Utility
*
* Functions for normalizing product data to enable consistent matching
* and prevent duplicate product entries.
*/
// CommonJS export wiring. The `__esModule` flag marks this module as
// transpiled from ES-module syntax for interop helpers that check it.
Object.defineProperty(exports, "__esModule", { value: true });
// Function declarations are hoisted, so each name can be assigned here even
// though its definition appears further down the file.
// Note: levenshteinDistance is intentionally not exported from this module.
exports.normalizeProductName = normalizeProductName;
exports.normalizeBrandName = normalizeBrandName;
exports.normalizeWeight = normalizeWeight;
exports.generateProductFingerprint = generateProductFingerprint;
exports.stringSimilarity = stringSimilarity;
exports.areProductsSimilar = areProductsSimilar;
/**
 * Normalize a product name for matching.
 *
 * Steps, in order:
 *  1. Lowercase.
 *  2. Remove cannabinoid potency annotations ("THC: 25.5%", "cbd 100mg").
 *  3. Remove weight/size annotations ("3.5g", "1/8 oz", "30 ml").
 *  4. Remove the phrase "special offer".
 *  5. Replace every remaining non-word character (punctuation, symbols,
 *     emoji) with a space.
 *  6. Collapse whitespace and trim.
 *
 * The pattern removals run BEFORE punctuation is stripped so that decimal
 * points, fraction slashes and percent signs are still present when the
 * numeric patterns are applied. Each removal substitutes a single space
 * (not an empty string) so that the words on either side of a removed
 * annotation are never fused together.
 *
 * @param {string} name - Raw product name; any falsy value yields ''.
 * @returns {string} Normalized name, possibly ''.
 */
function normalizeProductName(name) {
    if (!name) {
        return '';
    }
    return name
        .toLowerCase()
        // Potency annotation: cannabinoid label, any run of separators,
        // a (possibly decimal) number and an optional "%" or "mg" unit.
        .replace(/\b(?:thc|cbd|cbg|cbn|tac)\W*\d+(?:\.\d+)?\s*(?:%|mg\b)?/g, ' ')
        // Weight/size annotation: integer, decimal or simple fraction
        // followed by a unit.
        .replace(/\d+(?:[./]\d+)?\W*(?:grams?|ounces?|mg|ml|oz|g)\b/g, ' ')
        // Promotional phrase.
        .replace(/special\W*offer/g, ' ')
        // Everything that is not a word character or whitespace. Emoji are
        // outside \w, so this pass removes them as well.
        .replace(/[^\w\s]/g, ' ')
        .replace(/\s+/g, ' ')
        .trim();
}
/**
 * Normalize a brand name for matching.
 *
 * Lowercases the input, turns every character that is not a word character
 * or whitespace into a space, and joins the remaining words with single
 * spaces.
 *
 * @param {string} brand - Raw brand name; any falsy value yields ''.
 * @returns {string} Normalized brand name, possibly ''.
 */
function normalizeBrandName(brand) {
    if (!brand) {
        return '';
    }
    const lowered = brand.toLowerCase().trim();
    const withoutSymbols = lowered.replace(/[^\w\s]/g, ' ');
    // Splitting on whitespace runs and dropping empty pieces is equivalent
    // to collapsing whitespace and trimming both ends.
    const words = withoutSymbols.split(/\s+/).filter((word) => word !== '');
    return words.join(' ');
}
/**
 * Normalize a weight string to a compact canonical form.
 *
 * Examples: "3.5 grams" -> "3.5g", "1/8 oz" -> "3.5g", "2 oz" -> "56g",
 * "half gram" -> "0.5g", "500mg" -> "500mg".
 *
 * Rules:
 *  - Ounces are converted to grams using 28 g per ounce (so an eighth is
 *    exactly "3.5g"), rounded to one decimal with trailing zeros dropped.
 *  - A fraction ("1/8", "1/4", "1/2", "eighth", "quarter", "half") is read
 *    as a fraction of an ounce unless a unit directly follows it, in which
 *    case that unit is used ("1/2 g" -> "0.5g").
 *  - A number with no recognizable unit is assumed to be grams.
 *  - Input with neither a number nor a fraction is returned lowercased and
 *    trimmed, unchanged otherwise.
 *
 * @param {string} weight - Raw weight text; any falsy value yields ''.
 * @returns {string} Canonical weight, or the cleaned input if unparseable.
 */
function normalizeWeight(weight) {
    if (!weight) {
        return '';
    }
    const GRAMS_PER_OUNCE = 28;
    const FRACTION_VALUES = {
        '1/8': 0.125,
        eighth: 0.125,
        '1/4': 0.25,
        quarter: 0.25,
        '1/2': 0.5,
        half: 0.5,
    };
    // Render a quantity canonically; ounce quantities become grams.
    const render = (value, rawUnit) => {
        if (rawUnit.startsWith('o')) {
            // Number(...) drops a trailing ".0" so "2 oz" and "56 g" agree.
            return `${Number((value * GRAMS_PER_OUNCE).toFixed(1))}g`;
        }
        const unit = rawUnit.startsWith('gram') ? 'g' : rawUnit;
        return `${value}${unit}`;
    };
    const w = weight.toLowerCase().trim();
    // Fractions. The look-arounds keep "11/8" or "1/25" from being read as
    // "1/8" or "1/2".
    const fraction = w.match(/(?<![\d./])(1\/8|1\/4|1\/2)(?![\d/])|\b(eighth|quarter|half)s?\b/);
    if (fraction) {
        const share = FRACTION_VALUES[fraction[1] ?? fraction[2]];
        const tail = w.slice(fraction.index + fraction[0].length);
        const unitAfter = tail.match(/^\s*(?:of\s+)?(?:an?\s+)?(mg|ml|grams?|g|ounces?|oz)\b/);
        return render(share, unitAfter ? unitAfter[1] : 'oz');
    }
    // A bare ounce unit with no number means one ounce.
    if (w === 'oz' || w === 'ounce') {
        return render(1, 'oz');
    }
    // Number (requires at least one digit) with an optional unit; longer
    // unit spellings are listed first so "grams" is not cut short to "g".
    const quantity = w.match(/(\d+(?:\.\d+)?|\.\d+)\s*(mg|ml|grams?|g|ounces?|oz)?/);
    if (!quantity) {
        return w;
    }
    return render(parseFloat(quantity[1]), quantity[2] ?? 'g');
}
/**
 * Build the deduplication fingerprint for a product.
 *
 * The fingerprint is the normalized name, normalized brand, normalized
 * weight and stringified category id joined with '|'. Components that come
 * out empty are omitted entirely, so the result has between zero and four
 * segments and a segment's position does not identify which field it came
 * from.
 *
 * @param {string} name - Raw product name.
 * @param {string} brand - Raw brand name.
 * @param {string} weight - Raw weight text.
 * @param {*} [categoryId] - Category identifier; skipped when null/undefined.
 * @returns {string} '|'-joined fingerprint, '' when every component is empty.
 */
function generateProductFingerprint(name, brand, weight, categoryId) {
    const candidates = [
        normalizeProductName(name),
        normalizeBrandName(brand),
        normalizeWeight(weight),
        categoryId?.toString() || '',
    ];
    let fingerprint = '';
    for (const piece of candidates) {
        if (!piece) {
            continue;
        }
        fingerprint = fingerprint ? `${fingerprint}|${piece}` : piece;
    }
    return fingerprint;
}
/**
 * Score how alike two strings are on a 0-100 scale.
 *
 * The comparison is case-insensitive. The score is the share of the longer
 * string's length that is NOT covered by the Levenshtein edit distance,
 * rounded to the nearest integer.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} 100 for identical inputs, 0 when exactly one side is
 *   empty/falsy, otherwise the rounded similarity percentage.
 */
function stringSimilarity(str1, str2) {
    // Strict identity also covers the "both empty" and "both null" cases.
    if (str1 === str2) {
        return 100;
    }
    if (!str1 || !str2) {
        return 0;
    }
    const first = str1.toLowerCase();
    const second = str2.toLowerCase();
    if (first === second) {
        return 100;
    }
    const [longer, shorter] = first.length > second.length
        ? [first, second]
        : [second, first];
    const span = longer.length;
    if (span === 0) {
        // Defensive: empty strings were already handled by the checks above.
        return 100;
    }
    const edits = levenshteinDistance(longer, shorter);
    return Math.round(((span - edits) / span) * 100);
}
/**
 * Levenshtein (edit) distance between two strings: the minimum number of
 * single-element insertions, deletions and substitutions needed to turn
 * one into the other. Elements are compared by index with ===.
 *
 * Only the previous and current rows of the distance table are kept, since
 * each cell depends solely on its left, upper and upper-left neighbours.
 *
 * @param {string} str1 - Source string.
 * @param {string} str2 - Target string.
 * @returns {number} Edit distance (0 when the strings are equal).
 */
function levenshteinDistance(str1, str2) {
    const sourceLength = str1.length;
    const targetLength = str2.length;
    // Row 0: turning an empty prefix into the first j target elements
    // costs j insertions.
    let previousRow = Array.from({ length: targetLength + 1 }, (_, j) => j);
    for (let i = 1; i <= sourceLength; i++) {
        // Column 0: dropping the first i source elements costs i deletions.
        const currentRow = [i];
        for (let j = 1; j <= targetLength; j++) {
            const substitutionCost = str1[i - 1] === str2[j - 1] ? 0 : 1;
            const viaDeletion = previousRow[j] + 1;
            const viaInsertion = currentRow[j - 1] + 1;
            const viaSubstitution = previousRow[j - 1] + substitutionCost;
            currentRow.push(Math.min(viaDeletion, viaInsertion, viaSubstitution));
        }
        previousRow = currentRow;
    }
    return previousRow[targetLength];
}
/**
 * Decide whether two products are likely the same item.
 *
 * Checks, in order (first hit wins):
 *  1. Normalized names at least `threshold` similar.
 *  2. Identical non-empty normalized brands and names at least
 *     `threshold - 10` similar (confidence = similarity + 5).
 *  3. Identical non-empty normalized weights and names at least
 *     `threshold - 15` similar (confidence = similarity + 3).
 *
 * If either name normalizes to an empty string there is no name evidence
 * to compare, so the products are reported as not similar with confidence
 * 0 instead of being treated as a perfect match of two empty strings.
 * Confidence is capped at 100 so the brand/weight bonuses cannot push it
 * out of range when a high custom threshold is used.
 *
 * @param {{name: string, brand?: string, weight?: string}} product1
 * @param {{name: string, brand?: string, weight?: string}} product2
 * @param {number} [threshold=92] - Minimum name similarity (0-100) for an
 *   outright match.
 * @returns {{isSimilar: boolean, confidence: number}} Verdict plus a
 *   confidence score in the range 0-100.
 */
function areProductsSimilar(product1, product2, threshold = 92) {
    const capAtHundred = (score) => Math.min(100, score);
    const name1 = normalizeProductName(product1.name);
    const name2 = normalizeProductName(product2.name);
    if (!name1 || !name2) {
        return { isSimilar: false, confidence: 0 };
    }
    const nameSimilarity = stringSimilarity(name1, name2);
    // If names are very similar, likely same product
    if (nameSimilarity >= threshold) {
        return { isSimilar: true, confidence: capAtHundred(nameSimilarity) };
    }
    // Same brand: accept a somewhat weaker name match
    const brand1 = normalizeBrandName(product1.brand);
    const brand2 = normalizeBrandName(product2.brand);
    if (brand1 && brand2 && brand1 === brand2 && nameSimilarity >= threshold - 10) {
        return { isSimilar: true, confidence: capAtHundred(nameSimilarity + 5) };
    }
    // Same weight: accept a weaker name match still
    const weight1 = normalizeWeight(product1.weight);
    const weight2 = normalizeWeight(product2.weight);
    if (weight1 && weight2 && weight1 === weight2 && nameSimilarity >= threshold - 15) {
        return { isSimilar: true, confidence: capAtHundred(nameSimilarity + 3) };
    }
    return { isSimilar: false, confidence: nameSimilarity };
}