- Add backend stale process monitoring API (/api/stale-processes) - Add users management route - Add frontend landing page and stale process monitor UI on /scraper-tools - Move old development scripts to backend/archive/ - Update frontend build with new features 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
170 lines
4.9 KiB
JavaScript
170 lines
4.9 KiB
JavaScript
const puppeteer = require('puppeteer-extra');
|
||
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
|
||
const { Pool } = require('pg');
|
||
|
||
// Register the stealth plugin with puppeteer-extra before any browser is launched.
// NOTE(review): presumably this is here so the target site is less likely to
// flag the headless browser as a bot — confirm it is still required.
puppeteer.use(StealthPlugin());
|
||
|
||
// Shared PostgreSQL connection pool used by the comparison step.
// DATABASE_URL, when set, overrides the local development default so that
// credentials for other environments never have to be edited into source.
const pool = new Pool({
  connectionString:
    process.env.DATABASE_URL ||
    'postgresql://dutchie:dutchie_local_pass@localhost:54320/dutchie_menus',
});
|
||
|
||
/**
 * Scrape the Curaleaf store locator and return the Arizona stores it links to.
 *
 * A link is treated as an Arizona store when its URL contains
 * "/stores/curaleaf" and either the URL contains "-az-" / "arizona" or the
 * link text contains "arizona" / ", az" (case-insensitive).
 *
 * @param {object} [options]
 * @param {number} [options.settleDelayMs=3000] Extra time to wait after
 *   navigation so dynamically rendered links have a chance to appear.
 * @returns {Promise<Array<{name: string, slug: string, url: string}>>}
 *   One entry per slug; when a slug appears several times the last link wins.
 */
async function scrapeArizonaStores({ settleDelayMs = 3000 } = {}) {
  const browser = await puppeteer.launch({
    headless: 'new',
    args: ['--no-sandbox', '--disable-setuid-sandbox'],
  });

  try {
    const page = await browser.newPage();

    // Set a desktop user agent
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');

    console.log('Navigating to Curaleaf stores page...');
    await page.goto('https://curaleaf.com/stores/', {
      waitUntil: 'networkidle2',
      timeout: 30000,
    });

    // page.waitForTimeout() is deprecated and absent from recent Puppeteer
    // releases; a plain timer behaves the same on every version.
    await new Promise((resolve) => setTimeout(resolve, settleDelayMs));

    const stores = await page.evaluate(() => {
      // This callback runs inside the page, so it must not reference
      // anything from the surrounding Node.js scope.
      const results = [];
      const links = Array.from(document.querySelectorAll('a[href*="/stores/"]'));

      for (const link of links) {
        const href = link.href;
        const text = link.textContent.trim();
        const hrefLower = href.toLowerCase();
        const textLower = text.toLowerCase();

        const isCuraleafStore = href.includes('/stores/curaleaf');
        const isArizona =
          hrefLower.includes('-az-') ||
          hrefLower.includes('arizona') ||
          textLower.includes('arizona') ||
          textLower.includes(', az');

        if (!isCuraleafStore || !isArizona) {
          continue;
        }

        // The slug is the first path segment after /stores/.
        const match = href.match(/\/stores\/([^\/\?#]+)/);
        if (match) {
          results.push({
            name: text,
            slug: match[1],
            url: href.split('?')[0].split('#')[0], // Remove query params and hash
          });
        }
      }

      return results;
    });

    // De-duplicate by slug before reporting so the count in the header
    // matches the numbered list printed below it.
    const uniqueStores = [
      ...new Map(stores.map((store) => [store.slug, store])).values(),
    ];

    console.log(`\nFound ${uniqueStores.length} Arizona stores on Curaleaf website:\n`);

    uniqueStores.forEach((store, i) => {
      console.log(`${i + 1}. ${store.name}`);
      console.log(`   Slug: ${store.slug}`);
      console.log(`   URL: ${store.url}\n`);
    });

    return uniqueStores;
  } finally {
    // Always shut the browser down, even when navigation or scraping fails.
    await browser.close();
  }
}
|
||
|
||
/**
 * Compare the scraped stores with the Curaleaf rows in the `stores` table and
 * print a report to the console.
 *
 * This is a dry run: suggested UPDATE statements are printed, never executed.
 * The shared pool is closed before the function returns, so it can only be
 * called once per process.
 *
 * @param {Array<{name: string, slug: string, url: string}>} scrapedStores
 *   Stores found on the website.
 * @returns {Promise<void>}
 */
async function compareWithDatabase(scrapedStores) {
  // Quote a value as an SQL string literal so the printed statements stay
  // valid when a slug or URL contains a single quote.
  const sqlLiteral = (value) => `'${String(value).replace(/'/g, "''")}'`;

  const client = await pool.connect();

  try {
    // Get current stores from database. The parentheses only make the
    // AND-before-OR precedence explicit; the selected rows are unchanged.
    const result = await client.query(
      "SELECT id, name, slug, dutchie_url FROM stores WHERE (slug LIKE 'curaleaf%' AND slug LIKE '%az%') OR slug LIKE 'curaleaf-dispensary%'"
    );

    const dbStores = result.rows;

    console.log('\n=== COMPARISON ===\n');

    // Create maps for easy lookup
    const scrapedMap = new Map(scrapedStores.map((s) => [s.slug, s]));
    const dbMap = new Map(dbStores.map((s) => [s.slug, s]));

    // Find stores that need updating
    const updates = [];

    for (const dbStore of dbStores) {
      const scraped = scrapedMap.get(dbStore.slug);

      if (scraped) {
        // Slug is known on the website: check whether the stored URL matches.
        if (dbStore.dutchie_url !== scraped.url) {
          console.log(`✏️  "${dbStore.name}" - URL mismatch`);
          console.log(`   DB:  ${dbStore.dutchie_url}`);
          console.log(`   Web: ${scraped.url}`);
        } else {
          console.log(`✅ "${dbStore.name}" - correct`);
        }
        continue;
      }

      // Store in DB but not found on website
      console.log(`⚠️  "${dbStore.name}" (${dbStore.slug}) - NOT FOUND on website`);

      // Try to find by matching name. An empty search term would match every
      // scraped store (''.includes('') is true), so skip the fallback then.
      const needle = String(dbStore.name || '')
        .toLowerCase()
        .replace('curaleaf - ', '');
      const matchByName =
        needle.trim() === ''
          ? undefined
          : scrapedStores.find((s) => s.name.toLowerCase().includes(needle));

      if (matchByName) {
        console.log(`   → Possible match: ${matchByName.slug}`);
        updates.push({
          id: dbStore.id,
          oldSlug: dbStore.slug,
          newSlug: matchByName.slug,
          newUrl: matchByName.url,
          name: dbStore.name,
        });
      }
    }

    // Find stores on website but not in DB
    for (const scraped of scrapedStores) {
      if (!dbMap.has(scraped.slug)) {
        console.log(`➕ "${scraped.name}" (${scraped.slug}) - ON WEBSITE but not in DB`);
      }
    }

    if (updates.length > 0) {
      console.log(`\n\nFound ${updates.length} stores that need updating. Apply updates? (This is a dry run, updates not applied)`);
      updates.forEach((u) => {
        console.log(`\nUPDATE stores SET slug=${sqlLiteral(u.newSlug)}, dutchie_url=${sqlLiteral(u.newUrl)} WHERE id=${u.id};`);
      });
    }
  } finally {
    client.release();
    // Wait for the pool to drain so shutdown errors are not lost as an
    // unhandled rejection.
    await pool.end();
  }
}
|
||
|
||
/**
 * Script entry point: scrape the Arizona stores from the website, then compare
 * them with the database.
 *
 * Failures are logged and turn the process exit code non-zero; the returned
 * promise itself never rejects.
 *
 * @returns {Promise<void>}
 */
async function main() {
  try {
    const scrapedStores = await scrapeArizonaStores();
    await compareWithDatabase(scrapedStores);
  } catch (error) {
    console.error('Error:', error);
    // Let shells and CI see that the run failed.
    process.exitCode = 1;

    // compareWithDatabase() ends the pool in its own finally block, and pg
    // rejects a second end() call, so only close the pool if nobody has yet.
    if (!pool.ending) {
      try {
        await pool.end();
      } catch (closeError) {
        console.error('Error closing pool:', closeError);
      }
    }
  }
}
|
||
|
||
// Entry point: start the scrape-and-compare run when this file is executed.
main();
|