Files
cannaiq/backend/archive/populate-jobs.ts
Kelly d91c55a344 feat: Add stale process monitor, users route, landing page, archive old scripts
- Add backend stale process monitoring API (/api/stale-processes)
- Add users management route
- Add frontend landing page and stale process monitor UI on /scraper-tools
- Move old development scripts to backend/archive/
- Update frontend build with new features

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 04:07:31 -07:00

182 lines
5.3 KiB
TypeScript

import { firefox } from 'playwright';
import { pool } from './src/db/migrate.js';
import { getRandomProxy } from './src/utils/proxyManager.js';
// Dispensary to process: the first CLI argument, or 112 when it is missing or empty.
const dispensaryIdArg = process.argv[2] || '112';
const dispensaryId = Number.parseInt(dispensaryIdArg, 10);
/** One brand link collected from a dispensary's brands listing page. */
interface Brand {
  /** Display name shown for the brand; falls back to the slug when the link has no text. */
  name: string;
  /** Part of the link's href that follows `/brands/`; used to de-duplicate brands. */
  slug: string;
  /** Link target for the brand's page, taken from the anchor's href. */
  url: string;
}
/**
 * Loads the `/brands` listing under a dispensary menu and collects every brand
 * link found on it.
 *
 * After the first brand link appears, the page is scrolled to the bottom up to
 * 10 times (stopping early once the document height stops growing) so that
 * brands loaded on scroll are present before the links are read.
 *
 * @param menuUrl - Base menu URL; `/brands` is appended to it (trailing slashes are ignored).
 * @param context - Browser context that owns `page`. Not used here; kept so existing call sites keep working.
 * @param page - Playwright page used for navigation and DOM evaluation.
 * @returns Brands de-duplicated by slug, in page order. If any step fails the
 *   error is logged and an empty array is returned instead of throwing.
 */
async function scrapeBrandsList(menuUrl: string, context: any, page: any): Promise<Brand[]> {
  try {
    // Strip trailing slashes so a menu URL ending in "/" does not yield "//brands".
    const brandsUrl = `${menuUrl.replace(/\/+$/, '')}/brands`;
    console.log(`📄 Loading brands page: ${brandsUrl}`);
    await page.goto(brandsUrl, {
      waitUntil: 'domcontentloaded',
      timeout: 60000
    });

    console.log(`⏳ Waiting for brands to render...`);
    await page.waitForSelector('a[href*="/brands/"]', { timeout: 45000 });
    console.log(`✅ Brands appeared!`);
    // Give the page a moment to settle before scrolling.
    await page.waitForTimeout(3000);

    // Scroll to load all brands
    console.log(`📜 Scrolling to load all brands...`);
    const maxScrolls = 10;
    let previousHeight = 0;
    for (let attempt = 0; attempt < maxScrolls; attempt++) {
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
      await page.waitForTimeout(1500);
      const currentHeight: number = await page.evaluate(() => document.body.scrollHeight);
      // No growth since the previous scroll: nothing more was loaded.
      if (currentHeight === previousHeight) break;
      previousHeight = currentHeight;
    }

    // Extract all brand links. This callback runs inside the browser page, so it
    // must be self-contained and cannot reference anything from this module.
    const brands: Brand[] = await page.evaluate(() => {
      const seen = new Set<string>();
      const unique: { slug: string; name: string; url: string }[] = [];

      for (const link of Array.from(document.querySelectorAll('a[href*="/brands/"]'))) {
        const href = link.getAttribute('href') || '';

        // Slug = whatever follows "/brands/", without query string, fragment
        // or a trailing slash.
        const afterBrands = href.split('/brands/')[1] ?? '';
        const slug = (afterBrands.split(/[?#]/)[0] ?? '').replace(/\/$/, '');

        // Get brand name from the ContentWrapper div to avoid placeholder letter duplication
        const contentWrapper = link.querySelector('[class*="ContentWrapper"]');
        const name = contentWrapper?.textContent?.trim() || link.textContent?.trim() || slug;

        // Filter out duplicates and invalid entries (first occurrence wins).
        if (!slug || !name || seen.has(slug)) continue;
        seen.add(slug);

        unique.push({
          slug,
          name,
          // An anchor's `href` property is already resolved against the page
          // URL, so relative links become absolute; otherwise keep the raw value.
          url: link instanceof HTMLAnchorElement ? link.href : href
        });
      }

      return unique;
    });

    console.log(`✅ Found ${brands.length} total brands`);
    return brands;
  } catch (error: unknown) {
    // Best effort by design: report the problem and let the caller treat an
    // empty list as "no brands found".
    const message = error instanceof Error ? error.message : String(error);
    console.error(`❌ Error scraping brands list:`, message);
    return [];
  }
}
/**
 * Script entry point: discovers the brands on a dispensary's menu and queues
 * one pending scrape job per brand in `brand_scrape_jobs`.
 *
 * Steps: look up the dispensary row for `dispensaryId`, pick a proxy, open a
 * headless Firefox page through it, scrape the brand list, then insert the
 * jobs. Brands that already have a job for this dispensary are left untouched
 * and counted as skipped.
 *
 * Failure cases (invalid or unknown dispensary, no proxy, no brands) log a
 * message and set a non-zero exit code. The browser (if launched) and the
 * database pool are released on every path, including when an error is thrown.
 */
async function main(): Promise<void> {
  const divider = '='.repeat(60);
  console.log(`\n${divider}`);
  console.log(`🏭 POPULATING JOB QUEUE`);
  console.log(` Dispensary ID: ${dispensaryId}`);
  console.log(`${divider}\n`);

  let browser: Awaited<ReturnType<typeof firefox.launch>> | undefined;

  try {
    // A non-numeric CLI argument parses to NaN; reject it before touching the database.
    if (!Number.isInteger(dispensaryId)) {
      console.error(`❌ Invalid dispensary ID: ${process.argv[2]}`);
      process.exitCode = 1;
      return;
    }

    // Get dispensary info
    const dispensaryResult = await pool.query(
      "SELECT id, name, menu_url FROM dispensaries WHERE id = $1",
      [dispensaryId]
    );
    if (dispensaryResult.rows.length === 0) {
      console.error(`❌ Dispensary ID ${dispensaryId} not found`);
      // exitCode + return (rather than process.exit) so the finally block can clean up.
      process.exitCode = 1;
      return;
    }
    const menuUrl = dispensaryResult.rows[0].menu_url;
    console.log(`✅ Dispensary: ${dispensaryResult.rows[0].name}`);
    console.log(` Menu URL: ${menuUrl}\n`);

    // Get proxy
    const proxy = await getRandomProxy();
    if (!proxy) {
      console.log(`❌ No proxy available`);
      process.exitCode = 1;
      return;
    }
    console.log(`🔐 Using proxy: ${proxy.server}\n`);

    // Launch browser
    browser = await firefox.launch({ headless: true });
    const context = await browser.newContext({
      viewport: { width: 1920, height: 1080 },
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
      proxy: {
        server: proxy.server,
        username: proxy.username,
        password: proxy.password
      }
    });
    const page = await context.newPage();

    // Get all brands
    const allBrands = await scrapeBrandsList(menuUrl, context, page);
    if (allBrands.length === 0) {
      console.log(`❌ No brands found`);
      process.exitCode = 1;
      return;
    }
    console.log(`\n📋 Found ${allBrands.length} brands. Populating job queue...\n`);

    // Insert jobs into database
    let inserted = 0;
    let skipped = 0;
    let failed = 0;
    for (const brand of allBrands) {
      try {
        // RETURNING yields a row only when the INSERT actually happened; with
        // ON CONFLICT DO NOTHING an existing job returns zero rows. This is
        // what distinguishes "inserted" from "skipped".
        const result = await pool.query(`
          INSERT INTO brand_scrape_jobs (dispensary_id, brand_slug, brand_name, status)
          VALUES ($1, $2, $3, 'pending')
          ON CONFLICT (dispensary_id, brand_slug) DO NOTHING
          RETURNING id
        `, [dispensaryId, brand.slug, brand.name]);

        if (result.rows.length > 0) {
          inserted++;
          if (inserted % 10 === 0) {
            console.log(` Inserted ${inserted}/${allBrands.length} jobs...`);
          }
        } else {
          skipped++;
        }
      } catch (error: unknown) {
        // One bad brand must not abort the whole run; record it and continue.
        failed++;
        const message = error instanceof Error ? error.message : String(error);
        console.error(`❌ Error inserting job for ${brand.name}:`, message);
      }
    }

    console.log(`\n${divider}`);
    console.log(`✅ JOB QUEUE POPULATED`);
    console.log(` Total brands: ${allBrands.length}`);
    console.log(` Jobs inserted: ${inserted}`);
    console.log(` Jobs skipped (already exist): ${skipped}`);
    if (failed > 0) {
      console.log(` Jobs failed: ${failed}`);
    }
    console.log(`${divider}\n`);
  } finally {
    // Release the browser first, then the pool, even if closing the browser throws.
    try {
      await browser?.close();
    } finally {
      await pool.end();
    }
  }
}
// Run the script. An unhandled failure is logged and the process exits with a
// non-zero status, matching the explicit failure paths elsewhere in the script.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});