Remove incorrect migration 029, add snapshot architecture, improve scraper

- Delete migration 029, which incorrectly created duplicate dispensaries
- Add migration 028 for snapshot architecture
- Improve downloader with proxy and user-agent (UA) rotation
- Update scraper monitor and tools pages
- Various scraper improvements

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Kelly
2025-12-01 08:52:54 -07:00
parent e5b88b093c
commit 199b6a8a23
12 changed files with 760 additions and 341 deletions

View File

@@ -93,7 +93,7 @@ export function ScraperMonitor() {
marginBottom: '-2px'
}}
>
Brand Scrape Jobs
Dispensary Jobs
</button>
<button
onClick={() => setActiveTab('scrapers')}
@@ -109,7 +109,7 @@ export function ScraperMonitor() {
marginBottom: '-2px'
}}
>
Legacy Scrapers
Crawl History
</button>
</div>
@@ -232,10 +232,10 @@ export function ScraperMonitor() {
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'start' }}>
<div style={{ flex: 1 }}>
<div style={{ fontSize: '18px', fontWeight: '600', marginBottom: '8px' }}>
{job.brand_name}
{job.dispensary_name || job.brand_name}
</div>
<div style={{ fontSize: '14px', color: '#666', marginBottom: '12px' }}>
Worker: {job.worker_id} | Job #{job.id}
{job.job_type || 'crawl'} | Job #{job.id}
</div>
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(150px, 1fr))', gap: '12px' }}>
<div>
@@ -290,8 +290,8 @@ export function ScraperMonitor() {
<table style={{ width: '100%', borderCollapse: 'collapse' }}>
<thead>
<tr style={{ background: '#f8f8f8', borderBottom: '2px solid #eee' }}>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Brand</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Worker</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Dispensary</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Type</th>
<th style={{ padding: '15px', textAlign: 'center', fontWeight: '600' }}>Status</th>
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Found</th>
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Saved</th>
@@ -302,8 +302,8 @@ export function ScraperMonitor() {
<tbody>
{recentJobs.map((job: any) => (
<tr key={job.id} style={{ borderBottom: '1px solid #eee' }}>
<td style={{ padding: '15px' }}>{job.brand_name}</td>
<td style={{ padding: '15px', fontSize: '14px', color: '#666' }}>{job.worker_id || '-'}</td>
<td style={{ padding: '15px' }}>{job.dispensary_name || job.brand_name}</td>
<td style={{ padding: '15px', fontSize: '14px', color: '#666' }}>{job.job_type || '-'}</td>
<td style={{ padding: '15px', textAlign: 'center' }}>
<span style={{
padding: '4px 10px',
@@ -481,22 +481,37 @@ export function ScraperMonitor() {
<table style={{ width: '100%', borderCollapse: 'collapse' }}>
<thead>
<tr style={{ background: '#f8f8f8', borderBottom: '2px solid #eee' }}>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Store</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Category</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Dispensary</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Status</th>
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Found</th>
<th style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>Products</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Last Scraped</th>
<th style={{ padding: '15px', textAlign: 'left', fontWeight: '600' }}>Last Crawled</th>
</tr>
</thead>
<tbody>
{history.map((item, index) => (
<tr key={index} style={{ borderBottom: '1px solid #eee' }}>
<td style={{ padding: '15px' }}>{item.store_name}</td>
<td style={{ padding: '15px' }}>{item.category_name}</td>
<td style={{ padding: '15px' }}>{item.dispensary_name || item.store_name}</td>
<td style={{ padding: '15px' }}>
<span style={{
padding: '4px 10px',
borderRadius: '12px',
fontSize: '12px',
fontWeight: '600',
background: item.status === 'completed' ? '#d1fae5' : item.status === 'failed' ? '#fee2e2' : '#fef3c7',
color: item.status === 'completed' ? '#065f46' : item.status === 'failed' ? '#991b1b' : '#92400e'
}}>
{item.status || '-'}
</span>
</td>
<td style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>
{item.products_found || '-'}
</td>
<td style={{ padding: '15px', textAlign: 'right', fontWeight: '600' }}>
{item.product_count}
</td>
<td style={{ padding: '15px', color: '#666' }}>
{new Date(item.last_scraped_at).toLocaleString()}
{item.last_scraped_at ? new Date(item.last_scraped_at).toLocaleString() : '-'}
</td>
</tr>
))}

View File

@@ -17,61 +17,61 @@ const USER_AGENTS = {
};
export function ScraperTools() {
const [stores, setStores] = useState<any[]>([]);
const [selectedStore, setSelectedStore] = useState<number | null>(null);
const [dispensaries, setDispensaries] = useState<any[]>([]);
const [selectedDispensary, setSelectedDispensary] = useState<number | null>(null);
const [parallelScrapers, setParallelScrapers] = useState(3);
const [selectedUserAgent, setSelectedUserAgent] = useState<string>('rotate-desktop');
const [scraping, setScraping] = useState(false);
const [downloadingImages, setDownloadingImages] = useState(false);
const [discoveringCategories, setDiscoveringCategories] = useState(false);
const [debugging, setDebugging] = useState(false);
const [notification, setNotification] = useState<{ message: string; type: 'success' | 'error' | 'info' } | null>(null);
const [loading, setLoading] = useState(true);
useEffect(() => {
loadStores();
loadDispensaries();
}, []);
const loadStores = async () => {
const loadDispensaries = async () => {
setLoading(true);
try {
const data = await api.getStores();
setStores(data.stores);
if (data.stores.length > 0) {
setSelectedStore(data.stores[0].id);
const data = await api.getDispensaries();
// Filter to dispensaries that have a menu_url and are scrape enabled
const scrapableDispensaries = data.dispensaries.filter((d: any) => d.menu_url && d.scrape_enabled);
setDispensaries(scrapableDispensaries);
if (scrapableDispensaries.length > 0) {
setSelectedDispensary(scrapableDispensaries[0].id);
}
} catch (error) {
console.error('Failed to load stores:', error);
console.error('Failed to load dispensaries:', error);
} finally {
setLoading(false);
}
};
const handleScrape = async () => {
if (!selectedStore || scraping) return;
if (!selectedDispensary || scraping) return;
setScraping(true);
try {
await api.scrapeStore(selectedStore, parallelScrapers, selectedUserAgent || undefined);
await api.triggerDispensaryCrawl(selectedDispensary);
setNotification({
message: `Scrape started with ${parallelScrapers} parallel scrapers using ${USER_AGENTS[selectedUserAgent as keyof typeof USER_AGENTS] || 'Random'} UA! Check the Scraper Monitor for progress.`,
message: `Crawl started for dispensary! Check the Scraper Monitor for progress.`,
type: 'success'
});
} catch (error: any) {
setNotification({ message: 'Failed to start scrape: ' + error.message, type: 'error' });
setNotification({ message: 'Failed to start crawl: ' + error.message, type: 'error' });
} finally {
setScraping(false);
}
};
const handleDownloadImages = async () => {
if (!selectedStore || downloadingImages) return;
if (!selectedDispensary || downloadingImages) return;
setDownloadingImages(true);
try {
const result = await api.downloadStoreImages(selectedStore);
// TODO: Implement dispensary image download endpoint
setNotification({
message: `Image download started! ${result.total_missing} missing images will be downloaded.`,
message: `Image download feature coming soon!`,
type: 'info'
});
} catch (error: any) {
@@ -81,35 +81,7 @@ export function ScraperTools() {
}
};
const handleDiscoverCategories = async () => {
if (!selectedStore || discoveringCategories) return;
setDiscoveringCategories(true);
try {
await api.discoverStoreCategories(selectedStore);
setNotification({ message: 'Category discovery started! Check logs for progress.', type: 'info' });
} catch (error: any) {
setNotification({ message: 'Failed to start category discovery: ' + error.message, type: 'error' });
} finally {
setDiscoveringCategories(false);
}
};
const handleDebug = async () => {
if (!selectedStore || debugging) return;
setDebugging(true);
try {
await api.debugScrapeStore(selectedStore);
setNotification({ message: 'Debug started! Check Logs page for output.', type: 'info' });
} catch (error: any) {
setNotification({ message: 'Debug failed: ' + error.message, type: 'error' });
} finally {
setDebugging(false);
}
};
const selectedStoreData = stores.find(s => s.id === selectedStore);
const selectedDispensaryData = dispensaries.find(d => d.id === selectedDispensary);
if (loading) {
return (
@@ -133,32 +105,32 @@ export function ScraperTools() {
<div className="space-y-6">
<div>
<h1 className="text-3xl font-bold">Scraper Tools</h1>
<p className="text-gray-500 mt-2">Manage scraping operations for your stores</p>
<p className="text-gray-500 mt-2">Manage crawling operations for dispensaries</p>
</div>
{/* Store Selection */}
{/* Dispensary Selection */}
<div className="card bg-base-100 shadow-xl">
<div className="card-body">
<h2 className="card-title">Select Store</h2>
<h2 className="card-title">Select Dispensary</h2>
<select
className="select select-bordered w-full max-w-md"
value={selectedStore || ''}
onChange={(e) => setSelectedStore(parseInt(e.target.value))}
value={selectedDispensary || ''}
onChange={(e) => setSelectedDispensary(parseInt(e.target.value))}
>
{stores.map(store => (
<option key={store.id} value={store.id}>
{store.name} ({store.product_count || 0} products)
{dispensaries.map(disp => (
<option key={disp.id} value={disp.id}>
{disp.dba_name || disp.name} - {disp.city}, {disp.state}
</option>
))}
</select>
{selectedStoreData && (
{selectedDispensaryData && (
<div className="mt-4 p-4 bg-base-200 rounded-lg">
<div className="grid grid-cols-2 md:grid-cols-4 gap-4 text-sm">
<div>
<div className="text-gray-500">Status</div>
<div className="font-semibold">
{selectedStoreData.scrape_enabled ? (
{selectedDispensaryData.scrape_enabled ? (
<span className="badge badge-success">Enabled</span>
) : (
<span className="badge badge-error">Disabled</span>
@@ -166,18 +138,18 @@ export function ScraperTools() {
</div>
</div>
<div>
<div className="text-gray-500">Categories</div>
<div className="font-semibold">{selectedStoreData.category_count || 0}</div>
<div className="text-gray-500">Provider</div>
<div className="font-semibold">{selectedDispensaryData.provider_type || 'Unknown'}</div>
</div>
<div>
<div className="text-gray-500">Products</div>
<div className="font-semibold">{selectedStoreData.product_count || 0}</div>
<div className="font-semibold">{selectedDispensaryData.product_count || 0}</div>
</div>
<div>
<div className="text-gray-500">Last Scraped</div>
<div className="text-gray-500">Last Crawled</div>
<div className="font-semibold">
{selectedStoreData.last_scraped_at
? new Date(selectedStoreData.last_scraped_at).toLocaleDateString()
{selectedDispensaryData.last_crawl_at
? new Date(selectedDispensaryData.last_crawl_at).toLocaleDateString()
: 'Never'}
</div>
</div>
@@ -189,56 +161,21 @@ export function ScraperTools() {
{/* Scraper Actions */}
<div className="grid grid-cols-1 md:grid-cols-2 gap-6">
{/* Scrape Now */}
{/* Crawl Now */}
<div className="card bg-base-100 shadow-xl">
<div className="card-body">
<h2 className="card-title">Scrape Store</h2>
<h2 className="card-title">Crawl Dispensary</h2>
<p className="text-sm text-gray-500">
Start scraping products from the selected store
Start crawling products from the selected dispensary menu
</p>
<div className="form-control w-full mt-4">
<label className="label">
<span className="label-text">Parallel Scrapers</span>
</label>
<input
type="number"
min="1"
max="10"
value={parallelScrapers}
onChange={(e) => setParallelScrapers(parseInt(e.target.value) || 3)}
className="input input-bordered w-full"
/>
<label className="label">
<span className="label-text-alt">Number of concurrent scraping processes (1-10)</span>
</label>
</div>
<div className="form-control w-full mt-4">
<label className="label">
<span className="label-text">User Agent</span>
</label>
<select
className="select select-bordered w-full"
value={selectedUserAgent}
onChange={(e) => setSelectedUserAgent(e.target.value)}
>
{Object.entries(USER_AGENTS).map(([key, label]) => (
<option key={key} value={key}>{label}</option>
))}
</select>
<label className="label">
<span className="label-text-alt">Browser/bot identity for scraping session</span>
</label>
</div>
<div className="card-actions justify-end mt-4">
<button
onClick={handleScrape}
disabled={!selectedStore || scraping}
disabled={!selectedDispensary || scraping}
className={`btn btn-primary ${scraping ? 'loading' : ''}`}
>
{scraping ? 'Scraping...' : 'Start Scrape'}
{scraping ? 'Starting...' : 'Start Crawl'}
</button>
</div>
</div>
@@ -249,13 +186,13 @@ export function ScraperTools() {
<div className="card-body">
<h2 className="card-title">Download Images</h2>
<p className="text-sm text-gray-500">
Download missing product images for the selected store
Download missing product images for the selected dispensary
</p>
<div className="card-actions justify-end mt-auto">
<button
onClick={handleDownloadImages}
disabled={!selectedStore || downloadingImages}
disabled={!selectedDispensary || downloadingImages}
className={`btn btn-secondary ${downloadingImages ? 'loading' : ''}`}
>
{downloadingImages ? 'Downloading...' : 'Download Missing Images'}
@@ -263,46 +200,6 @@ export function ScraperTools() {
</div>
</div>
</div>
{/* Discover Categories */}
<div className="card bg-base-100 shadow-xl">
<div className="card-body">
<h2 className="card-title">Discover Categories</h2>
<p className="text-sm text-gray-500">
Automatically discover and create categories from the store
</p>
<div className="card-actions justify-end mt-auto">
<button
onClick={handleDiscoverCategories}
disabled={!selectedStore || discoveringCategories}
className={`btn btn-accent ${discoveringCategories ? 'loading' : ''}`}
>
{discoveringCategories ? 'Discovering...' : 'Discover Categories'}
</button>
</div>
</div>
</div>
{/* Debug Scraper */}
<div className="card bg-base-100 shadow-xl">
<div className="card-body">
<h2 className="card-title">Debug Scraper</h2>
<p className="text-sm text-gray-500">
Run scraper in debug mode and view detailed logs
</p>
<div className="card-actions justify-end mt-auto">
<button
onClick={handleDebug}
disabled={!selectedStore || debugging}
className={`btn btn-warning ${debugging ? 'loading' : ''}`}
>
{debugging ? 'Debugging...' : 'Start Debug'}
</button>
</div>
</div>
</div>
</div>
{/* Quick Links */}