- Add backend stale process monitoring API (/api/stale-processes) - Add users management route - Add frontend landing page and stale process monitor UI on /scraper-tools - Move old development scripts to backend/archive/ - Update frontend build with new features 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
302 lines
11 KiB
JavaScript
302 lines
11 KiB
JavaScript
"use strict";
|
|
/**
|
|
* Platform ID Extractor - Standalone script for extracting Dutchie platform IDs
|
|
*
|
|
* This script visits dispensary websites to capture their Dutchie retailerId
|
|
* by intercepting network requests to the Dutchie GraphQL API.
|
|
*
|
|
* It does NOT use the main orchestrator - it's a standalone browser-based tool.
|
|
*/
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
const playwright_1 = require("playwright");
|
|
const pg_1 = require("pg");
|
|
// Shared PostgreSQL connection pool for this script; the connection string
// is read from the DATABASE_URL environment variable.
const poolOptions = { connectionString: process.env.DATABASE_URL };
const pool = new pg_1.Pool(poolOptions);
|
|
/**
 * Patterns that capture a 24-character hex retailer ID from URLs, request
 * bodies, HTML and inline JSON. Order matters: specific patterns come first,
 * the generic "hex segment in a URL path" pattern is the last resort.
 */
const DUTCHIE_ID_PATTERNS = [
    /["']retailerId["']\s*:\s*["']([a-f0-9]{24})["']/i,
    /["']dispensaryId["']\s*:\s*["']([a-f0-9]{24})["']/i,
    /retailer["']?\s*:\s*["']([a-f0-9]{24})["']/i,
    /dutchie\.com\/embedded-menu\/([a-f0-9]{24})/i,
    /dutchie\.com\/dispensary\/([a-f0-9]{24})/i,
    /plus\.dutchie\.com\/plus\/([a-f0-9]{24})/i,
    /retailerId=([a-f0-9]{24})/i,
    /\/([a-f0-9]{24})(?:\/|\?|$)/i, // Generic ID in URL path
];

/** Any 24-character hex run; only applied to iframe URLs that mention dutchie. */
const DUTCHIE_IFRAME_ID_PATTERN = /([a-f0-9]{24})/i;

/**
 * Returns the first ID captured by any pattern in DUTCHIE_ID_PATTERNS, or null.
 * Every pattern captures exactly 24 hex characters, so no length check is needed.
 */
function findPlatformId(text) {
    for (const pattern of DUTCHIE_ID_PATTERNS) {
        const match = text.match(pattern);
        if (match) {
            return match[1];
        }
    }
    return null;
}

/** Returns the first 24-hex ID found in the src of an iframe whose URL contains "dutchie", or null. */
async function findIdInDutchieIframes(page) {
    const sources = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('iframe')).map(f => f.src);
    });
    for (const src of sources) {
        if (!src.includes('dutchie')) {
            continue;
        }
        const match = src.match(DUTCHIE_IFRAME_ID_PATTERN);
        if (match) {
            return match[1];
        }
    }
    return null;
}

/** Formats any thrown value (Error or not) as a message of at most 100 characters. */
function summarizeError(err) {
    return String(err?.message ?? err).substring(0, 100);
}

/**
 * Visits a dispensary website and tries to discover its Dutchie platform ID.
 *
 * Sources are tried in this order, and the first hit wins:
 *   1. network requests to dutchie URLs (URL, then POST body) - checked
 *      continuously by a request listener while pages load;
 *   2. the page HTML;
 *   3. the __NEXT_DATA__ element;
 *   4. dutchie iframe URLs;
 *   5. script URLs / the first 1000 characters of inline scripts;
 *   6. the first menu/shop/order link, followed and rescanned (HTML, iframes);
 *   7. a delayed iframe check after a further wait.
 *
 * Failures while loading or scanning are caught and reported in the result's
 * `error` field; only a failure to create or close the browser context rejects.
 *
 * @param browser    Playwright browser used to open a fresh context.
 * @param dispensary Row providing `id`, `name` and `website`.
 * @returns {Promise<{id: *, name: *, website: *, platformId: (string|null), source: (string|null), error: (string|null)}>}
 */
async function extractPlatformId(browser, dispensary) {
    let capturedId = null;
    let captureSource = null;
    let errorMsg = null;
    // First capture wins: an ID already found (for example by the request
    // listener while a page scan was awaiting) is never overwritten by a
    // later, less reliable source.
    const capture = (id, source) => {
        if (!id || capturedId) {
            return;
        }
        capturedId = id;
        captureSource = source;
    };
    const context = await browser.newContext({
        userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    });
    try {
        // Created inside the try so the context is still closed if this fails.
        const page = await context.newPage();
        // Intercept network requests
        page.on('request', (request) => {
            if (capturedId) {
                return;
            }
            const url = request.url();
            if (!url.includes('dutchie')) {
                return;
            }
            // Check URL for retailer ID
            capture(findPlatformId(url), 'request_url');
            // Check POST data
            const postData = request.postData();
            if (postData && !capturedId) {
                capture(findPlatformId(postData), 'request_body');
            }
        });
        console.log(`\n[${dispensary.id}] ${dispensary.name}: ${dispensary.website}`);
        // Load main page
        await page.goto(dispensary.website, {
            waitUntil: 'domcontentloaded',
            timeout: 25000
        });
        await page.waitForTimeout(2000);
        // Check page content
        if (!capturedId) {
            capture(findPlatformId(await page.content()), 'page_content');
        }
        // Check __NEXT_DATA__
        if (!capturedId) {
            const nextData = await page.evaluate(() => {
                const el = document.getElementById('__NEXT_DATA__');
                return el?.textContent || null;
            });
            if (nextData) {
                capture(findPlatformId(nextData), '__NEXT_DATA__');
            }
        }
        // Check iframes
        if (!capturedId) {
            capture(await findIdInDutchieIframes(page), 'iframe_src');
        }
        // Check scripts
        if (!capturedId) {
            const scripts = await page.evaluate(() => {
                return Array.from(document.querySelectorAll('script'))
                    .map(s => s.src || s.innerHTML?.substring(0, 1000))
                    .filter(Boolean);
            });
            for (const script of scripts) {
                if (script.includes('dutchie') || script.includes('retailerId')) {
                    capture(findPlatformId(script), 'script');
                    if (capturedId) {
                        break;
                    }
                }
            }
        }
        // Try navigating to menu/shop page
        if (!capturedId) {
            const menuLink = await page.evaluate(() => {
                const links = Array.from(document.querySelectorAll('a'));
                for (const link of links) {
                    const href = link.href?.toLowerCase() || '';
                    const text = link.textContent?.toLowerCase() || '';
                    if (href.includes('menu') || href.includes('shop') || href.includes('order') ||
                        text.includes('menu') || text.includes('shop') || text.includes('order')) {
                        return link.href;
                    }
                }
                return null;
            });
            if (menuLink && !menuLink.startsWith('javascript:')) {
                try {
                    console.log(` -> Following menu link: ${menuLink.substring(0, 60)}...`);
                    await page.goto(menuLink, { waitUntil: 'domcontentloaded', timeout: 20000 });
                    await page.waitForTimeout(3000);
                    // The request listener may already have captured the ID from
                    // network traffic during this navigation; only fall back to
                    // scanning the new page when it has not.
                    if (!capturedId) {
                        capture(findPlatformId(await page.content()), 'menu_page_content');
                    }
                    // Check iframes on new page
                    if (!capturedId) {
                        capture(await findIdInDutchieIframes(page), 'menu_page_iframe');
                    }
                }
                catch (navError) {
                    // Best effort: a failed menu navigation must not abort the
                    // extraction, but it is reported instead of silently dropped.
                    console.log(` -> Menu navigation failed: ${summarizeError(navError)}`);
                }
            }
        }
        // Final wait for async content
        if (!capturedId) {
            await page.waitForTimeout(3000);
            // Final iframe check
            capture(await findIdInDutchieIframes(page), 'delayed_iframe');
        }
        if (capturedId) {
            console.log(` ✓ Found: ${capturedId} (${captureSource})`);
        }
        else {
            console.log(` ✗ Not found`);
        }
    }
    catch (e) {
        // Thrown values are not guaranteed to be Error instances.
        errorMsg = summarizeError(e);
        console.log(` ✗ Error: ${errorMsg}`);
    }
    finally {
        await context.close();
    }
    return {
        id: dispensary.id,
        name: dispensary.name,
        website: dispensary.website,
        platformId: capturedId,
        source: captureSource,
        error: errorMsg
    };
}
|
|
/**
 * Script entry point.
 *
 * With a numeric dispensary ID as the first CLI argument, processes only that
 * dispensary (provided it has a website). With no argument, processes every
 * AZ dispensary whose menu_type is 'dutchie', whose platform_dispensary_id is
 * missing, and which has a website. Each ID found is written back to the
 * database immediately, and a summary is printed at the end.
 *
 * The browser and the connection pool are released even when extraction or a
 * database update fails.
 *
 * @throws {Error} If the CLI argument is present but is not an integer, or if
 *                 a query, the browser launch or a database update fails.
 */
async function main() {
    let browser = null;
    try {
        // Get specific dispensary ID from command line, or process all missing.
        // An unparseable argument is rejected: silently treating it as "no
        // argument" would run (and write results for) the whole batch.
        const rawId = process.argv[2];
        let targetId = null;
        if (rawId !== undefined && rawId !== '') {
            targetId = Number.parseInt(rawId, 10);
            if (Number.isNaN(targetId)) {
                throw new Error(`Invalid dispensary ID argument: "${rawId}" (expected an integer)`);
            }
        }
        let query;
        let params = [];
        if (targetId !== null) {
            query = `
                SELECT id, name, website
                FROM dispensaries
                WHERE id = $1
                AND website IS NOT NULL AND website != ''
            `;
            params = [targetId];
        }
        else {
            query = `
                SELECT id, name, website
                FROM dispensaries
                WHERE state = 'AZ'
                AND menu_type = 'dutchie'
                AND (platform_dispensary_id IS NULL OR platform_dispensary_id = '')
                AND website IS NOT NULL AND website != ''
                ORDER BY name
            `;
        }
        const result = await pool.query(query, params);
        if (result.rows.length === 0) {
            console.log('No dispensaries to process');
            return;
        }
        console.log(`\n=== Platform ID Extractor ===`);
        console.log(`Processing ${result.rows.length} dispensaries...\n`);
        browser = await playwright_1.chromium.launch({
            headless: true,
            args: ['--no-sandbox', '--disable-setuid-sandbox']
        });
        const results = [];
        // Sequential on purpose: one site at a time, one browser context each.
        for (const dispensary of result.rows) {
            const extractionResult = await extractPlatformId(browser, dispensary);
            results.push(extractionResult);
            // Update database immediately if found
            if (extractionResult.platformId) {
                await pool.query('UPDATE dispensaries SET platform_dispensary_id = $1 WHERE id = $2', [extractionResult.platformId, extractionResult.id]);
            }
        }
        // Summary
        console.log('\n' + '='.repeat(60));
        console.log('SUMMARY');
        console.log('='.repeat(60));
        const found = results.filter(r => r.platformId);
        const notFound = results.filter(r => !r.platformId);
        console.log(`\nFound: ${found.length}/${results.length}`);
        if (found.length > 0) {
            console.log('\nSuccessful extractions:');
            for (const r of found) {
                console.log(` [${r.id}] ${r.name} -> ${r.platformId} (${r.source})`);
            }
        }
        if (notFound.length > 0) {
            console.log(`\nNot found: ${notFound.length}`);
            for (const r of notFound) {
                const reason = r.error || 'No Dutchie ID detected';
                console.log(` [${r.id}] ${r.name}: ${reason}`);
            }
        }
    }
    finally {
        // Release resources on every path; the pool is ended even if closing
        // the browser throws.
        try {
            if (browser) {
                await browser.close();
            }
        }
        finally {
            await pool.end();
        }
    }
}
|
|
// Run the script; any rejection from main() is logged and the process exits
// with status 1.
function reportFatalError(fatal) {
    console.error('Fatal error:', fatal);
    process.exit(1);
}
main().catch(reportFatalError);
|