feat: Add stale process monitor, users route, landing page, archive old scripts

- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-05 04:07:31 -07:00
parent d2d44d2aeb
commit d91c55a344
3115 changed files with 5755 additions and 719 deletions

View File

@@ -0,0 +1,72 @@
import { Pool } from 'pg';
// Shared PostgreSQL connection pool; the connection string is read from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
/**
 * Creates `stores` table records for all dispensaries that:
 * 1. Have menu_type = 'dutchie' AND platform_dispensary_id (ready for GraphQL crawl)
 * 2. Don't already have a linked stores record
 *
 * The stores table is required by the scraper engine (scrapeStore function).
 *
 * Inserts are attempted one dispensary at a time: a failed insert is logged and
 * counted, and processing continues with the next row. The connection pool is
 * closed before the function settles, including when the lookup query fails.
 */
async function bootstrapStores() {
  console.log('=== Bootstrapping stores for Dutchie dispensaries ===\n');
  try {
    // Find all dutchie dispensaries without linked stores
    const result = await pool.query(`
      SELECT d.id, d.name, d.slug, d.menu_type, d.platform_dispensary_id, d.menu_url
      FROM dispensaries d
      LEFT JOIN stores s ON s.dispensary_id = d.id
      WHERE d.menu_type = 'dutchie'
        AND d.platform_dispensary_id IS NOT NULL
        AND s.id IS NULL
      ORDER BY d.id
    `);
    console.log(`Found ${result.rows.length} dispensaries needing store records\n`);

    let created = 0;
    let errors = 0;
    for (const d of result.rows) {
      try {
        // Insert a store record linking to the dispensary. Only name, slug and
        // dispensary_id come from the dispensary row; the platform_dispensary_id
        // used for GraphQL crawling stays in the dispensaries table.
        const insertResult = await pool.query(`
          INSERT INTO stores (
            name,
            slug,
            dispensary_id,
            active,
            scrape_enabled,
            created_at,
            updated_at
          ) VALUES ($1, $2, $3, true, true, NOW(), NOW())
          RETURNING id
        `, [
          d.name,
          // Fall back to a slug derived from the name when the dispensary has none.
          d.slug || d.name.toLowerCase().replace(/[^a-z0-9]+/g, '-'),
          d.id
        ]);
        console.log(`[CREATED] Store ${insertResult.rows[0].id} for dispensary ${d.id}: ${d.name}`);
        created++;
      } catch (e: unknown) {
        // Narrow before reading `.message` so a non-Error throw cannot break the handler.
        const message = e instanceof Error ? e.message : String(e);
        console.error(`[ERROR] Dispensary ${d.id} (${d.name}): ${message}`);
        errors++;
      }
    }

    console.log('\n=== Bootstrap Summary ===');
    console.log(`Created: ${created}`);
    console.log(`Errors: ${errors}`);
    console.log(`Total needing stores: ${result.rows.length}`);
  } finally {
    // Release the pool even when the lookup query rejects.
    await pool.end();
  }
}
// Script entry point: run the bootstrap; on a rejected promise, print the message and exit with status 1.
bootstrapStores().catch(e => {
console.error('Fatal error:', e.message);
process.exit(1);
});

View File

@@ -0,0 +1,35 @@
import { Pool } from 'pg';
// Shared PostgreSQL connection pool; the connection string is read from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
/**
 * Prints a sample of crawl-ready Dutchie dispensaries (menu_type = 'dutchie' with a
 * platform_dispensary_id) together with the store row linked to each, followed by
 * counts of joined rows with and without a store.
 *
 * The sample is the first 15 joined rows ordered by dispensary id, so repeated runs
 * show the same rows. The pool is closed before the function settles, even when a
 * query fails.
 */
async function check() {
  try {
    // Check which dispensaries have linked stores
    const result = await pool.query(`
      SELECT d.id as disp_id, d.name, d.menu_type, d.platform_dispensary_id,
             s.id as store_id, s.name as store_name
      FROM dispensaries d
      LEFT JOIN stores s ON s.dispensary_id = d.id
      WHERE d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL
      ORDER BY d.id
      LIMIT 15
    `);
    console.log('Dispensaries with linked stores:');
    for (const r of result.rows) {
      console.log(` [${r.disp_id}] ${r.name} -> store ${r.store_id || 'NONE'} (${r.store_name || 'NOT LINKED'})`);
    }

    // Count how many joined rows have / lack a store
    const countResult = await pool.query(`
      SELECT
        COUNT(*) FILTER (WHERE s.id IS NOT NULL) as with_store,
        COUNT(*) FILTER (WHERE s.id IS NULL) as without_store
      FROM dispensaries d
      LEFT JOIN stores s ON s.dispensary_id = d.id
      WHERE d.menu_type = 'dutchie' AND d.platform_dispensary_id IS NOT NULL
    `);
    console.log('\nSummary:', countResult.rows[0]);
  } finally {
    await pool.end();
  }
}
// Script entry point. A rejected promise is reported and turned into a non-zero
// exit status instead of surfacing as an unhandled rejection.
check().catch((e: unknown) => {
  console.error('Fatal error:', e instanceof Error ? e.message : e);
  process.exit(1);
});

View File

@@ -0,0 +1,130 @@
import { Pool } from 'pg';
// Shared PostgreSQL connection pool; the connection string is read from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
/**
 * Fetches `url` with crawler-style request headers and returns the response body as text.
 *
 * @param url - URL to request; redirects are followed.
 * @param timeout - Milliseconds after which the request is aborted (default 10000).
 * @returns The response body as text. The HTTP status is not inspected.
 * @throws Whatever `fetch` or the body read rejects with, including the abort error
 *         raised when the timeout fires.
 */
async function fetchWithTimeout(url: string, timeout = 10000): Promise<string> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeout);
  try {
    const resp = await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      },
      redirect: 'follow',
    });
    // Read the body while the abort timer is still armed, so a stalled download
    // is cut off by the same timeout as the initial response.
    return await resp.text();
  } finally {
    // Always disarm the timer so it cannot keep the process alive or fire late.
    clearTimeout(timer);
  }
}
/**
 * Inspects raw page HTML for traces of a known menu provider.
 *
 * Checks run in priority order and the first hit wins:
 *  1. a `"dispensaryId": "<24 hex>"` JSON fragment   -> dutchie + platformId
 *  2. an `embedded-menu/<24 hex>` script reference  -> dutchie + platformId
 *  3. a dutchie.com dispensary/embedded-menu/stores link -> dutchie + menuUrl
 *  4. an iheartjane.com or jane.co host              -> jane (+ menuUrl when a full link is present)
 *  5. a *.treez.io host                              -> treez (+ menuUrl when a full link is present)
 *  6. a leafly.com/dispensary link                   -> leafly
 *
 * @param html - Page source to scan.
 * @returns The detected provider (`'unknown'` when nothing matched) plus whichever
 *          of `platformId` / `menuUrl` could be extracted.
 */
function detectDutchie(html: string): { provider: string; platformId?: string; menuUrl?: string } {
  // Check for reactEnv.dispensaryId (Curaleaf/Sol pattern)
  const reactEnvMatch = html.match(/"dispensaryId"\s*:\s*"([a-fA-F0-9]{24})"/i);
  if (reactEnvMatch) {
    return { provider: 'dutchie', platformId: reactEnvMatch[1] };
  }

  // Check for Dutchie embedded-menu script (Trulieve pattern)
  // Look for: embedded-menu/5eaf48fc972e6200b1303b97.js
  const embedMatch = html.match(/embedded-menu\/([a-f0-9]{24})(?:\.js)?/i);
  if (embedMatch) {
    return { provider: 'dutchie', platformId: embedMatch[1] };
  }

  // Check for dutchie.com links
  const dutchieLink = html.match(/https?:\/\/(?:www\.)?dutchie\.com\/(?:dispensary|embedded-menu|stores)\/([a-zA-Z0-9-]+)/i);
  if (dutchieLink) {
    return { provider: 'dutchie', menuUrl: dutchieLink[0] };
  }

  // Check for jane. The host must not be embedded in a longer label: a plain
  // substring test for "jane.co" also fires on hosts like "maryjane.com".
  if (/(?<![a-z0-9-])(?:iheartjane\.com|jane\.co)(?![a-z0-9-])/i.test(html)) {
    const janeMatch = html.match(/https?:\/\/(?:www\.)?(?:iheartjane\.com|jane\.co)\/[^"\s]+/i);
    return { provider: 'jane', menuUrl: janeMatch?.[0] };
  }

  // Check for treez
  if (html.includes('.treez.io')) {
    const treezMatch = html.match(/https?:\/\/[a-zA-Z0-9-]+\.treez\.io[^"\s]*/i);
    return { provider: 'treez', menuUrl: treezMatch?.[0] };
  }

  // Check for leafly
  if (html.includes('leafly.com/dispensary')) {
    return { provider: 'leafly' };
  }

  return { provider: 'unknown' };
}
/**
 * Scans up to 150 dispensaries that have a website but no platform_dispensary_id,
 * fetches each website, runs provider detection on the HTML and records the result
 * on the dispensaries row:
 *  - Dutchie with an ID   -> sets menu_type and platform_dispensary_id
 *  - Dutchie with a link  -> sets menu_type and menu_url
 *  - another provider     -> sets menu_type, and menu_url when one was found
 *
 * A failure for one store (fetch error, timeout, failed UPDATE) is logged and
 * counted; the scan continues. The pool is closed before the function settles.
 */
async function main() {
  try {
    const { rows: stores } = await pool.query(`
      SELECT id, name, website
      FROM dispensaries
      WHERE platform_dispensary_id IS NULL
        AND website IS NOT NULL
        AND website NOT LIKE '%example%'
      ORDER BY id
      LIMIT 150
    `);
    console.log(`Checking ${stores.length} stores...\n`);

    let dutchieCount = 0;
    let otherCount = 0;
    let errorCount = 0;

    for (const store of stores) {
      // Common prefix for every log line about this store.
      const label = `[${store.id}] ${store.name}`;
      try {
        const html = await fetchWithTimeout(store.website);
        const result = detectDutchie(html);

        if (result.provider === 'dutchie') {
          if (result.platformId) {
            await pool.query(
              'UPDATE dispensaries SET menu_type = $1, platform_dispensary_id = $2, updated_at = NOW() WHERE id = $3',
              ['dutchie', result.platformId, store.id]
            );
            console.log(`${label} => DUTCHIE (ID: ${result.platformId})`);
            dutchieCount++;
          } else if (result.menuUrl) {
            await pool.query(
              'UPDATE dispensaries SET menu_type = $1, menu_url = $2, updated_at = NOW() WHERE id = $3',
              ['dutchie', result.menuUrl, store.id]
            );
            console.log(`${label} => DUTCHIE (URL: ${result.menuUrl.slice(0, 60)})`);
            dutchieCount++;
          }
        } else if (result.provider !== 'unknown') {
          // COALESCE keeps the stored menu_url when detection found no link.
          await pool.query(
            'UPDATE dispensaries SET menu_type = $1, menu_url = COALESCE($2, menu_url), updated_at = NOW() WHERE id = $3',
            [result.provider, result.menuUrl, store.id]
          );
          console.log(`${label} => ${result.provider.toUpperCase()}`);
          otherCount++;
        } else {
          console.log(`${label} => no menu found`);
        }
      } catch (err: any) {
        // An aborted fetch is reported as a timeout; other messages are truncated to 40 chars.
        const errMsg = err.name === 'AbortError' ? 'timeout' : err.message?.slice(0, 40) || 'error';
        console.log(`${label} => ERROR: ${errMsg}`);
        errorCount++;
      }
    }

    console.log('\n=== Summary ===');
    console.log(`Dutchie detected: ${dutchieCount}`);
    console.log(`Other providers: ${otherCount}`);
    console.log(`Errors: ${errorCount}`);
  } finally {
    // Release the pool even when the initial SELECT rejects.
    await pool.end();
  }
}
// Script entry point. A failure is printed and reported through a non-zero exit
// status, matching the other scripts in this directory.
main().catch((e: unknown) => {
  console.error(e);
  process.exit(1);
});

View File

@@ -0,0 +1,19 @@
import { Pool } from 'pg';
// Shared PostgreSQL connection pool; the connection string is read from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
/**
 * Writes every dispensary that has a menu_type to stdout as a pretty-printed JSON
 * array, ordered by id. The pool is closed before the function settles, even when
 * the query fails.
 */
async function exportDispensaries() {
  try {
    const { rows } = await pool.query(`
      SELECT id, name, dba_name, company_name, slug,
             address, city, state, zip, latitude, longitude,
             website, menu_type, menu_url, platform_dispensary_id,
             created_at, updated_at
      FROM dispensaries
      WHERE menu_type IS NOT NULL
      ORDER BY id
    `);
    console.log(JSON.stringify(rows, null, 2));
  } finally {
    await pool.end();
  }
}
// Script entry point. Errors go to stderr (stdout carries only the JSON export)
// and produce a non-zero exit status instead of an unhandled rejection.
exportDispensaries().catch((e: unknown) => {
  console.error('Fatal error:', e instanceof Error ? e.message : e);
  process.exit(1);
});

View File

@@ -0,0 +1,278 @@
import { chromium } from 'playwright';
import { Pool } from 'pg';
// Shared PostgreSQL connection pool; the connection string is read from the DATABASE_URL environment variable.
const pool = new Pool({
connectionString: process.env.DATABASE_URL
});
/** The columns of a `dispensaries` row that this script selects and uses. */
interface Dispensary {
// Row id; used as the key when the captured platform ID is written back.
id: number;
// Dispensary name; used in log output.
name: string;
// URL opened in the browser to look for a platform ID.
website: string;
}
/**
 * Opens a dispensary's website in a fresh browser context and tries to discover a
 * 24-character hexadecimal platform ID for it.
 *
 * A request listener stays active for the whole visit and captures an ID from any
 * outgoing request whose URL contains "dutchie" (from the URL, or from a
 * retailerId / dispensaryId field in the POST body). While no ID has been captured,
 * these sources are also checked in order:
 *  1. the rendered page HTML,
 *  2. the `__NEXT_DATA__` element,
 *  3. `src` attributes of iframes that mention "dutchie",
 *  4. the page reached by clicking a visible Shop/Menu/Order link with a relative href,
 *  5. script tags and iframes after an additional delay.
 *
 * @param browser - Playwright browser used to create the isolated context.
 * @param dispensary - Row whose `website` is loaded.
 * @returns The captured ID, or null when nothing matched or the page could not be
 *          processed (the error is logged, not thrown).
 */
async function extractPlatformId(browser: any, dispensary: Dispensary): Promise<string | null> {
  let capturedId: string | null = null;
  const context = await browser.newContext();
  const page = await context.newPage();

  // Patterns tried, in order, against page HTML, __NEXT_DATA__ and script text.
  const patterns = [
    /["']retailerId["']\s*:\s*["']([a-f0-9]{24})["']/i,
    /dispensaryId["']\s*:\s*["']([a-f0-9]{24})["']/i,
    /retailer["']?\s*:\s*["']([a-f0-9]{24})["']/i,
    /dutchie\.com\/embedded-menu\/([a-f0-9]{24})/i,
    /dutchie\.com\/dispensary\/([a-f0-9]{24})/i,
    /plus\.dutchie\.com\/plus\/([a-f0-9]{24})/i,
    /retailerId=([a-f0-9]{24})/i,
  ];

  // Returns the captured ID of the first pattern that matches `text`, or null.
  const matchKnownPatterns = (text: string): string | null => {
    for (const pattern of patterns) {
      const match = text.match(pattern);
      if (match) return match[1];
    }
    return null;
  };

  // Returns the first 24-hex ID found in the src of an iframe that mentions dutchie, or null.
  const findIdInDutchieIframes = async (): Promise<string | null> => {
    const sources: string[] = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('iframe')).map(f => f.src);
    });
    for (const src of sources) {
      if (!src.includes('dutchie')) continue;
      const match = src.match(/([a-f0-9]{24})/i);
      if (match) return match[1];
    }
    return null;
  };

  // Intercept network requests to find retailer IDs
  page.on('request', (request: any) => {
    const url = request.url();
    // 'plus.dutchie' and 'api.dutchie' both contain 'dutchie', so one check covers all three.
    if (!url.includes('dutchie')) return;

    // Check URL for retailer ID
    const urlMatch = url.match(/[\/=]([a-f0-9]{24})(?:[\/\?&]|$)/i);
    if (urlMatch && !capturedId) {
      capturedId = urlMatch[1];
      console.log(` Captured from URL: ${capturedId}`);
    }

    const postData = request.postData();
    if (!postData) return;

    // Look for retailerId in GraphQL variables
    const retailerMatch = postData.match(/["']?retailerId["']?\s*:\s*["']([a-f0-9]{24})["']/i);
    if (retailerMatch && !capturedId) {
      capturedId = retailerMatch[1];
      console.log(` Captured retailerId: ${capturedId}`);
    }

    // Also look for dispensaryId
    const dispMatch = postData.match(/["']?dispensaryId["']?\s*:\s*["']([a-f0-9]{24})["']/i);
    if (dispMatch && !capturedId) {
      capturedId = dispMatch[1];
      console.log(` Captured dispensaryId: ${capturedId}`);
    }
  });

  try {
    console.log(`\nLoading ${dispensary.name}: ${dispensary.website}`);
    await page.goto(dispensary.website, { waitUntil: 'domcontentloaded', timeout: 30000 });
    // Wait for initial load
    await page.waitForTimeout(2000);

    // Check page content for a known ID pattern
    const content = await page.content();
    if (!capturedId) {
      const id = matchKnownPatterns(content);
      if (id) {
        capturedId = id;
        console.log(` Found in content: ${capturedId}`);
      }
    }

    // Check __NEXT_DATA__ if present
    if (!capturedId) {
      const nextData = await page.evaluate(() => {
        const el = document.getElementById('__NEXT_DATA__');
        return el?.textContent || null;
      });
      if (nextData) {
        const id = matchKnownPatterns(nextData);
        if (id) {
          capturedId = id;
          console.log(` Found in __NEXT_DATA__: ${capturedId}`);
        }
      }
    }

    // Look for iframes that might contain dutchie embed
    if (!capturedId) {
      const id = await findIdInDutchieIframes();
      if (id) {
        capturedId = id;
        console.log(` Found in iframe: ${capturedId}`);
      }
    }

    // If still not found, try clicking on "Shop" or "Menu" links
    if (!capturedId) {
      const menuSelectors = [
        'a:has-text("Shop")',
        'a:has-text("Menu")',
        'a:has-text("Order")',
        'a[href*="menu"]',
        'a[href*="shop"]',
        'a[href*="order"]',
        'button:has-text("Shop")',
        'button:has-text("Menu")',
      ];
      for (const selector of menuSelectors) {
        try {
          const element = page.locator(selector).first();
          const isVisible = await element.isVisible({ timeout: 500 });
          if (!isVisible) continue;

          const href = await element.getAttribute('href');
          // Only elements with a relative href are clicked; absolute links are skipped.
          if (!href || href.startsWith('http')) continue;

          console.log(` Clicking ${selector}...`);
          await element.click();
          await page.waitForTimeout(3000);

          // Check new page content
          const newContent = await page.content();
          if (!capturedId) {
            const id = matchKnownPatterns(newContent);
            if (id) {
              capturedId = id;
              console.log(` Found after navigation: ${capturedId}`);
            }
          }

          // Check iframes on new page
          if (!capturedId) {
            const id = await findIdInDutchieIframes();
            if (id) {
              capturedId = id;
              console.log(` Found in iframe after nav: ${capturedId}`);
            }
          }

          if (capturedId) break;
        } catch (e) {
          // Best effort: a selector that cannot be inspected or clicked is skipped
          // and the next selector is tried.
        }
      }
    }

    // If still not found, wait longer for async dutchie widget to load
    if (!capturedId) {
      console.log(` Waiting for async content...`);
      await page.waitForTimeout(5000);

      // Check for dutchie script tags (external src, or the first 500 chars of inline code)
      const scripts: (string | undefined)[] = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('script')).map(s => s.src || s.innerHTML?.substring(0, 500));
      });
      for (const script of scripts) {
        if (!script || !script.includes('dutchie')) continue;
        if (!capturedId) {
          const id = matchKnownPatterns(script);
          if (id) {
            capturedId = id;
            console.log(` Found in script: ${capturedId}`);
          }
        }
        if (capturedId) break;
      }

      // Final check of iframes after wait
      if (!capturedId) {
        const id = await findIdInDutchieIframes();
        if (id) {
          capturedId = id;
          console.log(` Found in iframe (delayed): ${capturedId}`);
        }
      }
    }
  } catch (e: unknown) {
    // Narrow before reading `.message` so the handler itself cannot throw.
    const message = e instanceof Error ? e.message : String(e);
    console.log(` Error: ${message.substring(0, 80)}`);
  } finally {
    await context.close();
  }

  return capturedId;
}
/**
 * Finds Arizona dispensaries marked as Dutchie that have a website but no
 * platform_dispensary_id, visits each site in a headless Chromium browser to
 * extract the ID, stores every ID found, and prints a found / not-found summary.
 *
 * The browser and the pool are both released before the function settles, even
 * when an extraction or UPDATE throws.
 */
async function main() {
  try {
    // Get dispensaries missing platform IDs
    const result = await pool.query(`
      SELECT id, name, website
      FROM dispensaries
      WHERE state = 'AZ'
        AND menu_type = 'dutchie'
        AND (platform_dispensary_id IS NULL OR platform_dispensary_id = '')
        AND website IS NOT NULL AND website != ''
      ORDER BY name
    `);
    console.log(`Found ${result.rows.length} dispensaries to process\n`);

    const results: { id: number; name: string; platformId: string | null }[] = [];
    const browser = await chromium.launch({ headless: true });
    try {
      for (const dispensary of result.rows) {
        const platformId = await extractPlatformId(browser, dispensary);
        results.push({ id: dispensary.id, name: dispensary.name, platformId });
        if (platformId) {
          // Update database
          await pool.query(
            'UPDATE dispensaries SET platform_dispensary_id = $1 WHERE id = $2',
            [platformId, dispensary.id]
          );
          console.log(` Updated database with ${platformId}`);
        }
      }
    } finally {
      // Close the browser even when a dispensary fails part-way through the loop.
      await browser.close();
    }

    console.log('\n=== SUMMARY ===');
    const found = results.filter(r => r.platformId);
    const notFound = results.filter(r => !r.platformId);
    console.log(`\nFound (${found.length}):`);
    found.forEach(r => console.log(` ${r.id}: ${r.name} -> ${r.platformId}`));
    console.log(`\nNot Found (${notFound.length}):`);
    notFound.forEach(r => console.log(` ${r.id}: ${r.name}`));
  } finally {
    await pool.end();
  }
}
// Script entry point: run the extraction; on a rejected promise, print the error and exit with status 1.
main().catch(e => {
console.error('Error:', e);
process.exit(1);
});

View File

@@ -0,0 +1,83 @@
import { Pool } from 'pg';
import * as fs from 'fs';
// Shared PostgreSQL connection pool; the connection string is read from the DATABASE_URL environment variable.
const pool = new Pool({ connectionString: process.env.DATABASE_URL });
/**
 * Imports dispensaries from a JSON file into the `dispensaries` table.
 *
 * Each entry is matched to an existing row by exact `name` and `city`:
 *  - on a match, the row is updated; every column is wrapped in COALESCE, so a
 *    null value in the file leaves the stored value unchanged;
 *  - otherwise a new row is inserted.
 *
 * A failure on one entry is logged and counted, and the import continues. The pool
 * is closed before the function settles.
 *
 * @param filePath - Path of a JSON file containing an array of dispensary objects.
 * @throws Error when the file cannot be read or parsed, or does not contain a JSON array.
 */
async function importDispensaries(filePath: string) {
  try {
    const data: unknown = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
    // Fail with a clear message instead of "data is not iterable" further down.
    if (!Array.isArray(data)) {
      throw new Error(`Expected a JSON array of dispensaries in ${filePath}`);
    }
    console.log(`Importing ${data.length} dispensaries...`);

    let inserted = 0;
    let updated = 0;
    let errors = 0;

    for (const d of data) {
      try {
        // Check if dispensary exists by name and city
        const { rows: existing } = await pool.query(
          `SELECT id FROM dispensaries WHERE name = $1 AND city = $2`,
          [d.name, d.city]
        );

        if (existing.length > 0) {
          // Update existing
          await pool.query(`
            UPDATE dispensaries SET
              dba_name = COALESCE($1, dba_name),
              company_name = COALESCE($2, company_name),
              slug = COALESCE($3, slug),
              address = COALESCE($4, address),
              state = COALESCE($5, state),
              zip = COALESCE($6, zip),
              latitude = COALESCE($7, latitude),
              longitude = COALESCE($8, longitude),
              website = COALESCE($9, website),
              menu_type = COALESCE($10, menu_type),
              menu_url = COALESCE($11, menu_url),
              platform_dispensary_id = COALESCE($12, platform_dispensary_id),
              updated_at = NOW()
            WHERE id = $13
          `, [
            d.dba_name, d.company_name, d.slug,
            d.address, d.state, d.zip,
            d.latitude, d.longitude, d.website,
            d.menu_type, d.menu_url, d.platform_dispensary_id,
            existing[0].id
          ]);
          console.log(`Updated: [${existing[0].id}] ${d.name} (${d.city})`);
          updated++;
        } else {
          // Insert new
          const { rows: newRow } = await pool.query(`
            INSERT INTO dispensaries (
              name, dba_name, company_name, slug,
              address, city, state, zip, latitude, longitude,
              website, menu_type, menu_url, platform_dispensary_id,
              created_at, updated_at
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, NOW(), NOW())
            RETURNING id
          `, [
            d.name, d.dba_name, d.company_name, d.slug,
            d.address, d.city, d.state, d.zip, d.latitude, d.longitude,
            d.website, d.menu_type, d.menu_url, d.platform_dispensary_id
          ]);
          console.log(`Inserted: [${newRow[0].id}] ${d.name} (${d.city})`);
          inserted++;
        }
      } catch (err: unknown) {
        // `d` may be null or a non-object entry, so read its name defensively here.
        const message = err instanceof Error ? err.message : String(err);
        console.error(`Error for ${d?.name}: ${message}`);
        errors++;
      }
    }

    console.log(`\n=== Import Summary ===`);
    console.log(`Inserted: ${inserted}`);
    console.log(`Updated: ${updated}`);
    console.log(`Errors: ${errors}`);
  } finally {
    await pool.end();
  }
}
// Script entry point. The file to import is the first CLI argument and defaults
// to /tmp/dispensaries-export.json. A failure is printed and reported through a
// non-zero exit status.
const filePath = process.argv[2] || '/tmp/dispensaries-export.json';
importDispensaries(filePath).catch((e: unknown) => {
  console.error(e);
  process.exit(1);
});

View File

@@ -0,0 +1,133 @@
import { chromium } from 'playwright';
/**
 * Debugging script: loads the JARS Mesa (AZ) shop page in headless Chromium and
 * prints everything that might reveal a retailer ID — UUIDs in the HTML, iframes,
 * "dutchie" mentions, relevant script sources, UUIDs inside `__NEXT_DATA__`,
 * Dutchie URLs, and a summary of the network requests seen.
 *
 * A full-page screenshot is written to /tmp/jars-mesa-debug.png. The browser is
 * always closed, even when a step throws.
 */
async function extractJarsAzStoreIds() {
  const browser = await chromium.launch({ headless: true });
  const page = await browser.newPage();
  const capturedIds: string[] = [];
  const allRequests: string[] = [];
  // Shared by the page-content scan and the __NEXT_DATA__ scan.
  const uuidPattern = /[a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12}/gi;

  // Intercept network requests to find Dutchie Plus API calls
  page.on('request', (request) => {
    const url = request.url();
    allRequests.push(url.substring(0, 100));
    if (!url.includes('dutchie') && !url.includes('graphql')) return;

    const postData = request.postData();
    console.log('Dutchie request to:', url.substring(0, 80));
    if (!postData) return;

    // Look for retailerId in GraphQL variables
    const match = postData.match(/"retailerId"\s*:\s*"([a-f0-9-]{36})"/i);
    if (match && !capturedIds.includes(match[1])) {
      capturedIds.push(match[1]);
      console.log('Captured retailerId from request:', match[1]);
    }
  });

  try {
    // Just load one page first and thoroughly debug it
    console.log('Loading Mesa store with full network debugging...');
    await page.goto('https://jarscannabis.com/shop/mesa-az/', {
      waitUntil: 'networkidle',
      timeout: 60000
    });
    console.log('\nWaiting 5 seconds for dynamic content...');
    await page.waitForTimeout(5000);

    // Get page title and content
    const title = await page.title();
    console.log('Page title:', title);
    const content = await page.content();
    console.log('Page content length:', content.length);

    // Save screenshot
    await page.screenshot({ path: '/tmp/jars-mesa-debug.png', fullPage: true });
    console.log('Screenshot saved to /tmp/jars-mesa-debug.png');

    // Look for all UUIDs in content
    const uuids = content.match(uuidPattern);
    if (uuids) {
      console.log('\n=== All UUIDs found on page ===');
      for (const uuid of new Set(uuids)) console.log(uuid);
    }

    // Look for all iframes
    const iframes = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('iframe')).map(f => ({
        src: f.src,
        id: f.id,
        name: f.name,
        className: f.className
      }));
    });
    console.log('\n=== Iframes ===');
    console.log(JSON.stringify(iframes, null, 2));

    // Look for any elements with dutchie (first 20 attribute-like fragments)
    const dutchieElements = await page.evaluate(() => {
      const elements = document.body.innerHTML.match(/dutchie[^<>]*\"/gi) || [];
      return elements.slice(0, 20);
    });
    console.log('\n=== Dutchie mentions ===');
    dutchieElements.forEach(e => console.log(e));

    // Look for script src containing dutchie
    const scripts = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('script[src]'))
        .map(s => s.getAttribute('src'))
        .filter(src => src && (src.includes('dutchie') || src.includes('embed')));
    });
    console.log('\n=== Relevant scripts ===');
    scripts.forEach(s => console.log(s));

    // Look for __NEXT_DATA__
    const nextData = await page.evaluate(() => {
      const el = document.getElementById('__NEXT_DATA__');
      return el ? el.textContent : null;
    });
    if (nextData) {
      console.log('\n=== __NEXT_DATA__ found ===');
      try {
        // Round-trip through JSON so escaped characters are decoded before scanning.
        const propsStr = JSON.stringify(JSON.parse(nextData), null, 2);
        const propsUuids = propsStr.match(uuidPattern);
        if (propsUuids) {
          console.log('UUIDs in __NEXT_DATA__:', [...new Set(propsUuids)]);
        }
      } catch (e: unknown) {
        // Malformed JSON should not stop the remaining diagnostics from printing.
        console.log('Failed to parse __NEXT_DATA__:', e instanceof Error ? e.message : e);
      }
    } else {
      console.log('\nNo __NEXT_DATA__ found');
    }

    // Look for specific Dutchie embed patterns
    const embedPatterns = content.match(/https:\/\/[^"'\s]*dutchie[^"'\s]*/gi);
    if (embedPatterns) {
      console.log('\n=== Dutchie embed URLs ===');
      for (const embedUrl of new Set(embedPatterns)) console.log(embedUrl);
    }

    console.log('\n=== Network requests summary ===');
    console.log('Total requests:', allRequests.length);
    const dutchieRequests = allRequests.filter(r => r.includes('dutchie'));
    console.log('Dutchie requests:', dutchieRequests.length);
    dutchieRequests.forEach(r => console.log(r));

    console.log('\n=== CAPTURED IDS ===');
    console.log(capturedIds);
  } finally {
    await browser.close();
  }
}
// Script entry point. A failure is printed and flagged through the exit status so
// callers can tell a failed run from a successful one.
extractJarsAzStoreIds().catch((e: unknown) => {
  console.error('Error:', e instanceof Error ? e.message : e);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,197 @@
import { chromium } from 'playwright';
/**
 * Exploratory script for locating JARS' Arizona stores and their retailer IDs.
 *
 * Steps, in order: open the find-a-dispensary page and screenshot it; click every
 * visible candidate "Arizona" selector; list shop/menu/Arizona links; mine
 * `__NEXT_DATA__` for Arizona-looking objects and retailerId values; probe a list
 * of guessed Arizona URLs; finally print the retailer IDs and API calls captured
 * from network traffic along the way. The browser is always closed.
 */
async function findJarsAzStores() {
  const browser = await chromium.launch({ headless: true });
  const page = await browser.newPage();
  const capturedRetailerIds: { url: string; retailerId: string }[] = [];
  const allApiCalls: string[] = [];

  // Watch every request; keep the ones that look like Buddy / Dutchie / GraphQL calls.
  page.on('request', (req) => {
    const requestUrl = req.url();
    const looksLikeApi = ['buddyapi', 'dutchie', 'graphql'].some(token => requestUrl.includes(token));
    if (!looksLikeApi) return;
    allApiCalls.push(requestUrl);

    // retailerId inside the POST body, in any of several quoting styles
    const body = req.postData();
    const bodyHit = body ? body.match(/retailerId['":\s]+([a-f0-9-]{36})/i) : null;
    if (bodyHit) {
      capturedRetailerIds.push({ url: requestUrl, retailerId: bodyHit[1] });
    }

    // retailerId passed as a query parameter
    const queryHit = requestUrl.match(/retailerId=([a-f0-9-]{36})/i);
    if (queryHit) {
      capturedRetailerIds.push({ url: requestUrl, retailerId: queryHit[1] });
    }
  });

  // Cities used to recognise an Arizona store object when no state field matches.
  const arizonaCities = ['Mesa', 'Phoenix', 'Peoria', 'Payson', 'Globe', 'Safford', 'Somerton', 'Prescott Valley'];

  // Walks arbitrary JSON depth-first and collects every object that looks like an
  // Arizona store, together with the path at which it was found.
  const findArizonaStores = (obj: any, path: string = ''): any[] => {
    if (!obj || typeof obj !== 'object') return [];
    if (Array.isArray(obj)) {
      return obj.flatMap((item, i) => findArizonaStores(item, `${path}[${i}]`));
    }
    const hits: any[] = [];
    const looksArizonan =
      obj.state === 'AZ' || obj.state === 'Arizona' ||
      obj.stateCode === 'AZ' || obj.region === 'Arizona' ||
      (obj.city && arizonaCities.includes(obj.city));
    if (looksArizonan) {
      hits.push({ path, data: obj });
    }
    for (const key of Object.keys(obj)) {
      hits.push(...findArizonaStores(obj[key], `${path}.${key}`));
    }
    return hits;
  };

  try {
    // Start from the store locator page
    console.log('Loading JARS find-a-dispensary page...');
    await page.goto('https://jarscannabis.com/find-a-dispensary', {
      waitUntil: 'networkidle',
      timeout: 30000
    });
    await page.waitForTimeout(3000);

    await page.screenshot({ path: '/tmp/jars-find-dispensary.png', fullPage: true });
    console.log('Screenshot saved to /tmp/jars-find-dispensary.png');

    // Click anything visible that could select Arizona
    console.log('\nLooking for state selector...');
    const stateSelectors = [
      'select[name*="state"]',
      '[class*="state"] select',
      'select option[value="AZ"]',
      'button:has-text("Arizona")',
      'a:has-text("Arizona")',
      '[data-state="AZ"]',
      'div:has-text("Arizona")',
    ];
    for (const selector of stateSelectors) {
      try {
        const candidate = page.locator(selector).first();
        if (await candidate.isVisible({ timeout: 1000 })) {
          console.log(`Found element with selector: ${selector}`);
          await candidate.click();
          await page.waitForTimeout(2000);
        }
      } catch (e) {
        // Ignore and move on to the next selector
      }
    }

    // Collect links that point at shop / menu / Arizona pages
    const links = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('a')).map(a => ({
        href: a.href,
        text: a.textContent?.trim()
      })).filter(l => l.href.includes('/shop') || l.href.includes('menu') || l.href.includes('arizona') || l.href.includes('-az'));
    });
    console.log('\n=== Shop/Menu Links Found ===');
    for (const link of links) {
      console.log(`${link.text}: ${link.href}`);
    }

    // __NEXT_DATA__ may carry location data
    const nextData = await page.evaluate(() => {
      const el = document.getElementById('__NEXT_DATA__');
      return el?.textContent || null;
    });
    if (nextData) {
      console.log('\n=== Analyzing __NEXT_DATA__ ===');
      const data = JSON.parse(nextData);
      const dataStr = JSON.stringify(data);

      if (dataStr.includes('Arizona') || dataStr.includes('AZ')) {
        console.log('Found Arizona references in __NEXT_DATA__');
        const azStores = findArizonaStores(data);
        console.log(`Found ${azStores.length} Arizona store objects`);
        for (const store of azStores) {
          console.log('\n---');
          console.log('Path:', store.path);
          console.log(JSON.stringify(store.data, null, 2));
        }
      }

      // Retailer IDs mentioned anywhere in the payload
      const retailerMatches = dataStr.match(/"retailerId"\s*:\s*"([a-f0-9-]{36})"/gi);
      if (retailerMatches) {
        console.log('\n=== RetailerIds in __NEXT_DATA__ ===');
        const extracted = retailerMatches
          .map(fragment => {
            const idMatch = fragment.match(/([a-f0-9-]{36})/i);
            return idMatch ? idMatch[1] : null;
          })
          .filter(Boolean);
        for (const id of new Set(extracted)) {
          console.log(id);
        }
      }
    }

    // Probe guessed Arizona landing pages
    const testUrls = [
      'https://jarscannabis.com/arizona/',
      'https://jarscannabis.com/az/',
      'https://jarscannabis.com/stores/arizona/',
      'https://jarscannabis.com/locations/arizona/',
      'https://jarscannabis.com/shop/arizona/',
      'https://az.jarscannabis.com/',
    ];
    console.log('\n=== Testing Arizona URLs ===');
    for (const testUrl of testUrls) {
      try {
        const response = await page.goto(testUrl, { waitUntil: 'domcontentloaded', timeout: 10000 });
        const status = response?.status();
        console.log(`${testUrl}: ${status}`);
        if (status !== 200) continue;

        const title = await page.title();
        console.log(` Title: ${title}`);

        // A working page: list the shop/menu links it exposes
        const storeLinks = await page.evaluate(() => {
          return Array.from(document.querySelectorAll('a')).map(a => ({
            href: a.href,
            text: a.textContent?.trim()
          })).filter(l => l.href.includes('shop') || l.href.includes('menu'));
        });
        if (storeLinks.length > 0) {
          console.log(' Store links:');
          for (const link of storeLinks) {
            console.log(` ${link.text}: ${link.href}`);
          }
        }
      } catch (e) {
        console.log(`${testUrl}: Error - ${(e as Error).message.substring(0, 50)}`);
      }
    }

    // De-duplicate by retailerId: first-seen order, last-seen entry per ID.
    console.log('\n=== Captured Retailer IDs from API calls ===');
    const byRetailerId = new Map<string, { url: string; retailerId: string }>();
    for (const entry of capturedRetailerIds) {
      byRetailerId.set(entry.retailerId, entry);
    }
    for (const entry of byRetailerId.values()) {
      console.log(`${entry.retailerId} (from: ${entry.url.substring(0, 60)}...)`);
    }

    console.log('\n=== All API calls ===');
    for (const apiUrl of allApiCalls) {
      console.log(apiUrl.substring(0, 100));
    }
  } finally {
    await browser.close();
  }
}
// Script entry point. A failure is printed and flagged through the exit status so
// callers can tell a failed run from a successful one.
findJarsAzStores().catch((e: unknown) => {
  console.error('Error:', e instanceof Error ? e.message : e);
  process.exitCode = 1;
});

View File

@@ -0,0 +1,350 @@
/**
* Platform ID Extractor - Standalone script for extracting Dutchie platform IDs
*
* This script visits dispensary websites to capture their Dutchie retailerId
* by intercepting network requests to the Dutchie GraphQL API.
*
* It does NOT use the main orchestrator - it's a standalone browser-based tool.
*/
import { chromium, Browser, BrowserContext, Page } from 'playwright';
import { Pool } from 'pg';
// Shared PostgreSQL connection pool; the connection string is read from the DATABASE_URL environment variable.
const pool = new Pool({
connectionString: process.env.DATABASE_URL
});
/** Input to extractPlatformId: the dispensary whose website is visited. */
interface Dispensary {
// Identifier printed in log output and echoed back in the extraction result.
id: number;
// Dispensary name, printed in log output and echoed back in the result.
name: string;
// URL loaded in the browser to look for a platform ID.
website: string;
}
/** Outcome of one extractPlatformId run for a single dispensary. */
interface ExtractionResult {
// id, name and website are copied from the input Dispensary.
id: number;
name: string;
website: string;
// The captured 24-character hex ID, or null when none was found.
platformId: string | null;
// Label of the place the ID was captured from (e.g. 'request_url', 'page_content', 'iframe_src'), or null.
source: string | null;
// NOTE(review): presumably the truncated error message when the visit failed, otherwise null — confirm against the full function.
error: string | null;
}
async function extractPlatformId(browser: Browser, dispensary: Dispensary): Promise<ExtractionResult> {
let capturedId: string | null = null;
let captureSource: string | null = null;
let errorMsg: string | null = null;
const context = await browser.newContext({
userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
});
const page = await context.newPage();
// Patterns to match retailer IDs in various formats
const idPatterns = [
/["']retailerId["']\s*:\s*["']([a-f0-9]{24})["']/i,
/["']dispensaryId["']\s*:\s*["']([a-f0-9]{24})["']/i,
/retailer["']?\s*:\s*["']([a-f0-9]{24})["']/i,
/dutchie\.com\/embedded-menu\/([a-f0-9]{24})/i,
/dutchie\.com\/dispensary\/([a-f0-9]{24})/i,
/plus\.dutchie\.com\/plus\/([a-f0-9]{24})/i,
/retailerId=([a-f0-9]{24})/i,
/\/([a-f0-9]{24})(?:\/|\?|$)/i, // Generic ID in URL path
];
// Intercept network requests
page.on('request', (request) => {
if (capturedId) return;
const url = request.url();
if (url.includes('dutchie') || url.includes('api.dutchie')) {
// Check URL for retailer ID
for (const pattern of idPatterns) {
const match = url.match(pattern);
if (match && match[1] && match[1].length === 24) {
capturedId = match[1];
captureSource = 'request_url';
break;
}
}
// Check POST data
const postData = request.postData();
if (postData && !capturedId) {
for (const pattern of idPatterns) {
const match = postData.match(pattern);
if (match && match[1] && match[1].length === 24) {
capturedId = match[1];
captureSource = 'request_body';
break;
}
}
}
}
});
try {
console.log(`\n[${dispensary.id}] ${dispensary.name}: ${dispensary.website}`);
// Load main page
await page.goto(dispensary.website, {
waitUntil: 'domcontentloaded',
timeout: 25000
});
await page.waitForTimeout(2000);
// Check page content
if (!capturedId) {
const content = await page.content();
for (const pattern of idPatterns) {
const match = content.match(pattern);
if (match && match[1] && match[1].length === 24) {
capturedId = match[1];
captureSource = 'page_content';
break;
}
}
}
// Check __NEXT_DATA__
if (!capturedId) {
const nextData = await page.evaluate(() => {
const el = document.getElementById('__NEXT_DATA__');
return el?.textContent || null;
});
if (nextData) {
for (const pattern of idPatterns) {
const match = nextData.match(pattern);
if (match && match[1] && match[1].length === 24) {
capturedId = match[1];
captureSource = '__NEXT_DATA__';
break;
}
}
}
}
// Check iframes
if (!capturedId) {
const iframes = await page.evaluate(() => {
return Array.from(document.querySelectorAll('iframe')).map(f => f.src);
});
for (const src of iframes) {
if (src.includes('dutchie')) {
const match = src.match(/([a-f0-9]{24})/i);
if (match) {
capturedId = match[1];
captureSource = 'iframe_src';
break;
}
}
}
}
// Check scripts
if (!capturedId) {
const scripts = await page.evaluate(() => {
return Array.from(document.querySelectorAll('script'))
.map(s => s.src || s.innerHTML?.substring(0, 1000))
.filter(Boolean);
});
for (const script of scripts) {
if (script && (script.includes('dutchie') || script.includes('retailerId'))) {
for (const pattern of idPatterns) {
const match = script.match(pattern);
if (match && match[1] && match[1].length === 24) {
capturedId = match[1];
captureSource = 'script';
break;
}
}
if (capturedId) break;
}
}
}
// Try navigating to menu/shop page
if (!capturedId) {
const menuLink = await page.evaluate(() => {
const links = Array.from(document.querySelectorAll('a'));
for (const link of links) {
const href = link.href?.toLowerCase() || '';
const text = link.textContent?.toLowerCase() || '';
if (href.includes('menu') || href.includes('shop') || href.includes('order') ||
text.includes('menu') || text.includes('shop') || text.includes('order')) {
return link.href;
}
}
return null;
});
if (menuLink && !menuLink.startsWith('javascript:')) {
try {
console.log(` -> Following menu link: ${menuLink.substring(0, 60)}...`);
await page.goto(menuLink, { waitUntil: 'domcontentloaded', timeout: 20000 });
await page.waitForTimeout(3000);
// Recheck all sources on new page
const newContent = await page.content();
for (const pattern of idPatterns) {
const match = newContent.match(pattern);
if (match && match[1] && match[1].length === 24) {
capturedId = match[1];
captureSource = 'menu_page_content';
break;
}
}
// Check iframes on new page
if (!capturedId) {
const newIframes = await page.evaluate(() => {
return Array.from(document.querySelectorAll('iframe')).map(f => f.src);
});
for (const src of newIframes) {
if (src.includes('dutchie')) {
const match = src.match(/([a-f0-9]{24})/i);
if (match) {
capturedId = match[1];
captureSource = 'menu_page_iframe';
break;
}
}
}
}
} catch (navError: any) {
// Menu navigation failed, continue
}
}
}
// Final wait for async content
if (!capturedId) {
await page.waitForTimeout(3000);
// Final iframe check
const finalIframes = await page.evaluate(() => {
return Array.from(document.querySelectorAll('iframe')).map(f => f.src);
});
for (const src of finalIframes) {
if (src.includes('dutchie')) {
const match = src.match(/([a-f0-9]{24})/i);
if (match) {
capturedId = match[1];
captureSource = 'delayed_iframe';
break;
}
}
}
}
if (capturedId) {
console.log(` ✓ Found: ${capturedId} (${captureSource})`);
} else {
console.log(` ✗ Not found`);
}
} catch (e: any) {
errorMsg = e.message.substring(0, 100);
console.log(` ✗ Error: ${errorMsg}`);
} finally {
await context.close();
}
return {
id: dispensary.id,
name: dispensary.name,
website: dispensary.website,
platformId: capturedId,
source: captureSource,
error: errorMsg
};
}
/**
 * CLI entry point for the platform ID extractor.
 *
 * Selects dispensaries to process (a single row when a numeric id is given as
 * the first CLI argument, otherwise every AZ Dutchie dispensary that has a
 * website but no platform_dispensary_id), runs extractPlatformId on each one
 * sequentially, writes any discovered id back to the dispensaries table
 * immediately, and prints a summary.
 *
 * The browser and the connection pool are released even when a step throws.
 *
 * @throws Error when the first CLI argument is present but is not an integer.
 */
async function main() {
  try {
    // Get specific dispensary ID from command line, or process all missing
    const rawTargetId = process.argv[2];
    const targetId = rawTargetId ? parseInt(rawTargetId, 10) : null;

    // A non-numeric argument used to parse to NaN, which is falsy, and so
    // silently fell through to the bulk query. Refuse it instead of crawling
    // (and updating) every dispensary by accident.
    if (targetId !== null && Number.isNaN(targetId)) {
      throw new Error(`Invalid dispensary id: ${rawTargetId}`);
    }

    const query =
      targetId !== null
        ? `
SELECT id, name, website
FROM dispensaries
WHERE id = $1
AND website IS NOT NULL AND website != ''
`
        : `
SELECT id, name, website
FROM dispensaries
WHERE state = 'AZ'
AND menu_type = 'dutchie'
AND (platform_dispensary_id IS NULL OR platform_dispensary_id = '')
AND website IS NOT NULL AND website != ''
ORDER BY name
`;
    const params: number[] = targetId !== null ? [targetId] : [];

    const result = await pool.query(query, params);
    if (result.rows.length === 0) {
      console.log('No dispensaries to process');
      return; // pool is closed by the outer finally
    }

    console.log(`\n=== Platform ID Extractor ===`);
    console.log(`Processing ${result.rows.length} dispensaries...\n`);

    const browser = await chromium.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });

    const results: ExtractionResult[] = [];
    try {
      for (const dispensary of result.rows) {
        const extractionResult = await extractPlatformId(browser, dispensary);
        results.push(extractionResult);

        // Update database immediately if found, so progress survives a later failure
        if (extractionResult.platformId) {
          await pool.query(
            'UPDATE dispensaries SET platform_dispensary_id = $1 WHERE id = $2',
            [extractionResult.platformId, extractionResult.id]
          );
        }
      }
    } finally {
      // Always shut the browser down, even if an extraction or UPDATE threw.
      await browser.close();
    }

    // Summary
    console.log('\n' + '='.repeat(60));
    console.log('SUMMARY');
    console.log('='.repeat(60));

    const found = results.filter(r => r.platformId);
    const notFound = results.filter(r => !r.platformId);

    console.log(`\nFound: ${found.length}/${results.length}`);
    if (found.length > 0) {
      console.log('\nSuccessful extractions:');
      found.forEach(r => console.log(` [${r.id}] ${r.name} -> ${r.platformId} (${r.source})`));
    }
    if (notFound.length > 0) {
      console.log(`\nNot found: ${notFound.length}`);
      notFound.forEach(r => {
        const reason = r.error || 'No Dutchie ID detected';
        console.log(` [${r.id}] ${r.name}: ${reason}`);
      });
    }
  } finally {
    // Release the connection pool on every exit path (success, early return, error).
    await pool.end();
  }
}
// Script entry point: run main(); any rejection that escapes it is logged
// and reported to the shell as exit code 1.
main().catch((fatal) => {
  console.error('Fatal error:', fatal);
  process.exit(1);
});

View File

@@ -0,0 +1,301 @@
/**
* Test script for iHeartJane menu scraping via Playwright
* Intercepts API/Algolia calls made by the browser
*/
import { chromium } from 'playwright';
/**
 * Shape of a single iHeartJane product record.
 *
 * Only `id` and `name` are required; every other field is optional.
 * NOTE(review): field meanings and units (e.g. whether potencies are
 * percentages, or what keys `prices` uses) are not established in this
 * file — confirm against real Jane/Algolia payloads.
 */
interface JaneProduct {
  // Required identity fields
  id: number;
  name: string;

  // Optional classification
  brand?: string;
  category?: string;
  kind?: string;
  kind_subtype?: string;

  // Optional pricing: a single price and/or a keyed price table
  price?: number;
  prices?: Record<string, number>;

  // Optional potency figures
  thc_potency?: number;
  cbd_potency?: number;

  // Optional presentation / linkage
  image_url?: string;
  description?: string;
  store_id?: number;
}
/**
 * Loads an iHeartJane menu in headless Chromium and collects product data from
 * every source the page exposes:
 *   - Algolia search responses and Jane API "products" responses seen on the network,
 *   - Algolia credentials observed in outgoing request headers,
 *   - product cards scraped from the DOM (backup),
 *   - embedded page state (__NEXT_DATA__ or window-level state objects).
 *
 * It also tries several strategies to dismiss the age gate, scrolls to trigger
 * lazy loading, saves a full-page screenshot under /tmp, and logs a summary.
 *
 * @param urlOrStoreId - A full menu URL (anything starting with "http") or a
 *   bare store id, which is expanded to the embed menu URL.
 * @returns The captured API responses, DOM products, embedded data (or null)
 *   and captured credentials.
 */
async function scrapeJaneMenu(urlOrStoreId: string) {
  // Handle either a full URL or just a store ID
  const menuUrl = urlOrStoreId.startsWith('http')
    ? urlOrStoreId
    : `https://www.iheartjane.com/embed/stores/${urlOrStoreId}/menu`;
  console.log(`Starting Playwright scrape for iHeartJane: ${menuUrl}`);

  const browser = await chromium.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-blink-features=AutomationControlled'
    ]
  });

  // Everything after launch runs inside try/finally so the browser is closed
  // even when context/page creation fails.
  try {
    const context = await browser.newContext({
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      viewport: { width: 1920, height: 1080 },
      locale: 'en-US',
      timezoneId: 'America/Chicago'
    });

    // Add stealth scripts to avoid detection
    await context.addInitScript(() => {
      Object.defineProperty(navigator, 'webdriver', { get: () => false });
      (window as any).chrome = { runtime: {} };
    });

    const page = await context.newPage();
    const apiResponses: any[] = [];
    const capturedCredentials: { algolia?: { appId: string; apiKey: string } } = {};

    // Selector shared by the locator-based age gate strategies.
    const ageGateSelector = 'button:has-text("Yes")';

    // Intercept ALL network requests to capture API/Algolia data and credentials
    page.on('request', (request) => {
      const url = request.url();
      const headers = request.headers();
      // Capture Algolia credentials from request headers
      if (url.includes('algolia')) {
        const appId = headers['x-algolia-application-id'];
        const apiKey = headers['x-algolia-api-key'];
        if (appId && apiKey) {
          capturedCredentials.algolia = { appId, apiKey };
          console.log(`Captured Algolia credentials: App=${appId}, Key=${apiKey.substring(0, 10)}...`);
        }
      }
    });

    page.on('response', async (response) => {
      const url = response.url();
      // Capture Algolia search results (only the first result set is recorded)
      if (url.includes('algolia.net') || url.includes('algolianet.com')) {
        try {
          const data = await response.json();
          if (data.results && data.results[0] && data.results[0].hits) {
            console.log(`Captured ${data.results[0].hits.length} products from Algolia`);
            apiResponses.push({ type: 'algolia', data: data.results[0] });
          }
        } catch (e) {
          // Not JSON or error parsing — best-effort capture, ignore
        }
      }
      // Capture Jane API responses
      if (url.includes('api.iheartjane.com') && url.includes('products')) {
        try {
          const data = await response.json();
          console.log(`Captured Jane API response: ${url}`);
          apiResponses.push({ type: 'jane-api', url, data });
        } catch (e) {
          // Not JSON or error parsing — best-effort capture, ignore
        }
      }
    });

    console.log(`Navigating to: ${menuUrl}`);
    await page.goto(menuUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });
    // Wait for page to settle
    await page.waitForTimeout(2000);

    // Handle age gate - try progressively cruder click strategies until one works
    console.log('Looking for age gate...');
    try {
      let clicked = false;

      // Method 1: Use Playwright locator with text match and a forced click
      try {
        const yesButton = page.locator(ageGateSelector).first();
        await yesButton.waitFor({ state: 'visible', timeout: 5000 });
        await yesButton.click({ force: true });
        clicked = true;
        console.log('Clicked age gate via Playwright locator');
        await page.waitForTimeout(5000);
      } catch (e) {
        console.log('Playwright locator failed:', e instanceof Error ? e.message : e);
      }

      // Method 2: Try clicking by visible bounding box
      if (!clicked) {
        try {
          // Explicit timeout: without it, a page with no age gate would stall
          // here for Playwright's default timeout while waiting for the button.
          const box = await page.locator(ageGateSelector).first().boundingBox({ timeout: 5000 });
          if (box) {
            await page.mouse.click(box.x + box.width / 2, box.y + box.height / 2);
            clicked = true;
            console.log(`Clicked age gate at coordinates: ${box.x + box.width / 2}, ${box.y + box.height / 2}`);
            await page.waitForTimeout(5000);
          }
        } catch (e) {
          console.log('Bounding box click failed');
        }
      }

      // Method 3: Try JavaScript click
      if (!clicked) {
        const jsClickResult = await page.evaluate(() => {
          const buttons = Array.from(document.querySelectorAll('button'));
          for (const btn of buttons) {
            if (btn.textContent?.includes('Yes')) {
              btn.click();
              return { success: true, buttonText: btn.textContent };
            }
          }
          return { success: false };
        });
        if (jsClickResult.success) {
          clicked = true;
          console.log(`Clicked via JS: ${jsClickResult.buttonText}`);
          await page.waitForTimeout(5000);
        }
      }

      // Method 4: Click element containing "Yes" with dispatchEvent
      if (!clicked) {
        const dispatchResult = await page.evaluate(() => {
          const buttons = Array.from(document.querySelectorAll('button'));
          for (const btn of buttons) {
            if (btn.textContent?.includes('Yes')) {
              btn.dispatchEvent(new MouseEvent('click', { bubbles: true, cancelable: true }));
              return true;
            }
          }
          return false;
        });
        if (dispatchResult) {
          clicked = true;
          console.log('Clicked via dispatchEvent');
          await page.waitForTimeout(5000);
        }
      }

      // Log button info for debugging
      const buttonInfo = await page.evaluate(() => {
        const buttons = Array.from(document.querySelectorAll('button'));
        return buttons.map(b => ({
          text: b.textContent?.trim(),
          visible: b.offsetParent !== null,
          rect: b.getBoundingClientRect()
        }));
      });
      console.log('Buttons found:', JSON.stringify(buttonInfo, null, 2));
    } catch (e) {
      // Age gate handling is best-effort; keep scraping whatever is reachable.
      console.log('Age gate handling error:', e);
    }

    // Wait for content to load after age gate
    await page.waitForTimeout(3000);

    // Try to scroll to trigger more product loads
    console.log('Scrolling to load more products...');
    for (let i = 0; i < 3; i++) {
      await page.evaluate(() => window.scrollBy(0, 1000));
      await page.waitForTimeout(1000);
    }

    // Extract products from the page DOM as backup
    const domProducts = await page.evaluate(() => {
      const items: any[] = [];
      // Try various selectors that Jane might use
      const productCards = document.querySelectorAll('[data-testid*="product"], [class*="ProductCard"], [class*="product-card"], .product-tile');
      productCards.forEach((card) => {
        const name = card.querySelector('[class*="name"], [class*="title"], h3, h4')?.textContent?.trim();
        const brand = card.querySelector('[class*="brand"]')?.textContent?.trim();
        const price = card.querySelector('[class*="price"]')?.textContent?.trim();
        const image = card.querySelector('img')?.getAttribute('src');
        if (name) {
          items.push({ name, brand, price, image, source: 'dom' });
        }
      });
      return items;
    });
    console.log(`Extracted ${domProducts.length} products from DOM`);

    // Check for __NEXT_DATA__ or similar embedded data
    const embeddedData = await page.evaluate(() => {
      // Check for Next.js data
      const nextData = document.getElementById('__NEXT_DATA__');
      if (nextData) {
        try {
          return { type: 'next', data: JSON.parse(nextData.textContent || '{}') };
        } catch {
          // Malformed JSON must not abort the scrape and discard what was
          // already captured; fall through to the window-level checks.
        }
      }
      // Check for any window-level product data
      const win = window as any;
      if (win.__INITIAL_STATE__) return { type: 'initial_state', data: win.__INITIAL_STATE__ };
      if (win.__PRELOADED_STATE__) return { type: 'preloaded', data: win.__PRELOADED_STATE__ };
      if (win.products) return { type: 'products', data: win.products };
      return null;
    });
    if (embeddedData) {
      console.log(`Found embedded data: ${embeddedData.type}`);
      apiResponses.push(embeddedData);
    }

    // Take a screenshot for debugging
    const screenshotPath = `/tmp/jane-scrape-${Date.now()}.png`;
    await page.screenshot({ path: screenshotPath, fullPage: true });
    console.log(`Screenshot saved to ${screenshotPath}`);

    // Process captured API responses
    console.log('\n=== API Responses Summary ===');
    for (const resp of apiResponses) {
      console.log(`Type: ${resp.type}`);
      if (resp.type === 'algolia' && resp.data.hits) {
        console.log(` Hits: ${resp.data.hits.length}`);
        console.log(` Total: ${resp.data.nbHits}`);
        if (resp.data.hits[0]) {
          console.log(` Sample product:`, JSON.stringify(resp.data.hits[0], null, 2).substring(0, 1000));
        }
      }
    }

    console.log('\n=== DOM Products Sample ===');
    console.log(JSON.stringify(domProducts.slice(0, 3), null, 2));

    // NOTE(review): this prints the captured Algolia API key in full (the
    // earlier log line truncates it) — confirm that is acceptable for logs.
    console.log('\n=== Captured Credentials ===');
    console.log(JSON.stringify(capturedCredentials, null, 2));

    return {
      apiResponses,
      domProducts,
      embeddedData,
      capturedCredentials
    };
  } finally {
    await browser.close();
  }
}
// Main execution: the first CLI argument (menu URL or store id) selects the
// target; with no argument the default store below is scraped. A failed scrape
// is logged and reported to the shell as exit code 1.
const urlOrStoreId = process.argv[2] || 'https://iheartjane.com/aly2djS2yXoTGnR0/DBeqE6HSSwijog9l'; // Default to The Flower Shop Az
void (async () => {
  try {
    const outcome = await scrapeJaneMenu(urlOrStoreId);
    console.log('\n=== Scrape Complete ===');
    console.log(`Total API responses captured: ${outcome.apiResponses.length}`);
    console.log(`Total DOM products: ${outcome.domProducts.length}`);
  } catch (failure) {
    console.error('Scrape failed:', failure);
    process.exit(1);
  }
})();