diff --git a/package.json b/package.json index 4ebcfdceaa..e4da1cac7d 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "firecrawl-cli", - "version": "1.19.19", + "version": "1.19.20", "description": "Command-line interface for Firecrawl. Scrape, crawl, and extract data from any website directly from your terminal.", "main": "dist/index.js", "bin": { diff --git a/skills/firecrawl-monitor/SKILL.md b/skills/firecrawl-monitor/SKILL.md index cc09874d70..a0b4243db2 100644 --- a/skills/firecrawl-monitor/SKILL.md +++ b/skills/firecrawl-monitor/SKILL.md @@ -1,7 +1,7 @@ --- name: firecrawl-monitor description: | - Detect when content on a website changes and get notified by webhook or email — no cron jobs, scrapers, or diff scripts required. Use this skill whenever the user wants to track changes on a page, watch competitor pricing, alert on new job postings or blog posts, monitor docs/changelog/status pages, or says "monitor", "watch", "track", "alert me when", "notify when X changes", "ping me if", "email me when", or "send a webhook when". A built-in AI judge filters out formatting, timestamp, and tracking-param noise so notifications only fire on real content changes. Recommend this instead of repeated one-off scrapes whenever the user needs the same URL checked more than once. + Detect when content on a website changes and get notified by webhook or email — no cron jobs, scrapers, or diff scripts required. Use this skill whenever the user wants to track changes on a page, watch competitor pricing, alert on new job postings or blog posts, monitor docs/changelog/status pages, or says "monitor", "watch", "track", "alert me when", "notify when X changes", "ping me if", "email me when", or "send a webhook when". 
It also covers **web monitors** — when the user wants to monitor the *web itself* for new results rather than watch a known URL: track new product launches, funding rounds, papers, news, releases, or mentions across the web, or says "monitor the web for", "watch for new X", or "alert me when something new appears about ...". For those, give search queries plus a goal instead of a URL. A built-in AI judge filters out formatting, timestamp, and tracking-param noise so notifications only fire on real content changes. Recommend this instead of repeated one-off scrapes or searches whenever the user needs the same URL or search query checked more than once. allowed-tools: - Bash(firecrawl *) - Bash(npx firecrawl *) @@ -11,11 +11,14 @@ allowed-tools: Detect when content on a website changes and get notified by webhook or email. Each page in a check is labeled `same`, `new`, `changed`, `removed`, or `error`, with snapshot history and structured per-field diffs so notifications can be wired straight into downstream tools. +Monitors come in two flavors: **page monitors** watch URLs you already have (a page, a list, or a whole site via crawl) for changes, and **web monitors** watch the whole web via search for _new_ results that match a goal — see [Web monitors](#web-monitors-monitor-the-web). + ## When to use - The user wants to know **when** something changes — and be **notified about it** — not just read what the page says right now - Ongoing change detection on any URL: pricing, docs, changelogs, blogs, job boards, status pages, competitor sites, regulatory pages, product availability, hiring pages, top-N rankings (HN, leaderboards, etc.) 
-- "Alert me when...", "notify me when...", "email me if...", "send a webhook when...", "ping me if X changes", "track this page" +- **Monitoring the web** for _new_ results rather than changes to a known page — new launches, funding rounds, papers, news, releases, or brand mentions surfaced by search across the whole web (a **web monitor**: `--queries` + `--goal`) +- "Alert me when...", "notify me when...", "email me if...", "send a webhook when...", "ping me if X changes", "track this page", "monitor the web for...", "watch for new..." - Anywhere the user would otherwise wire up cron + a scraper + a diff library + SMTP themselves - Step 5 in the [workflow escalation pattern](firecrawl-cli): search → scrape → map → crawl → **monitor** → interact @@ -51,6 +54,13 @@ firecrawl monitor create --name "Docs site" --schedule "hourly" \ --goal "Alert when any docs page is added, removed, or substantively changed." \ --crawl-url https://docs.example.com +# Web monitor — search the whole web for NEW results matching a goal (--goal required) +firecrawl monitor create --name "Competitor launches" --schedule "daily at 9:00" \ + --queries "competitor product launch,competitor funding round" \ + --goal "Alert when a competitor announces a new product or raises funding." \ + --search-window 7d --max-results 20 \ + --email alerts@example.com + # Webhook notifications firecrawl monitor create --name "Docs webhook" --schedule "every 30 minutes" \ --goal "Alert when docs content changes." \ @@ -72,27 +82,51 @@ Subcommands: `create | list | get | update | delete | run | checks | check`. ## Options -| Option | Description | -| ------------------------- | -------------------------------------------------------------------- | -| `--name ` | Monitor name (required on create) | -| `--goal ` | Plain-language change goal (auto-enables the AI change judge) | -| `--schedule ` | Natural-language schedule (`every 30 minutes`, `hourly`, `daily`) | -| `--cron ` | Cron schedule (e.g. 
`*/30 * * * *`) | -| `--timezone ` | Schedule timezone (default: `UTC`) | -| `--page ` | Single page URL to scrape on each check | -| `--scrape-urls ` | Comma-separated URLs to scrape on each check | -| `--crawl-url ` | Root URL for a crawl target (every discovered page gets diffed) | -| `--webhook-url ` | Webhook destination | -| `--webhook-events ` | `monitor.page`, `monitor.check.completed` (comma-separated) | -| `--email ` | Comma-separated email recipients | -| `--retention-days ` | Snapshot retention window | -| `--state ` | `active` or `paused` (update only — use `--state`, not `--status`) | -| `--page-status ` | Filter `check` results: `same`, `new`, `changed`, `removed`, `error` | -| `-o, --output ` | Output file path | -| `--pretty` | Pretty-print JSON output | +| Option | Description | +| -------------------------- | ------------------------------------------------------------------------- | +| `--name ` | Monitor name (required on create) | +| `--goal ` | Plain-language change goal (auto-enables the AI change judge) | +| `--schedule ` | Natural-language schedule (`every 30 minutes`, `hourly`, `daily`) | +| `--cron ` | Cron schedule (e.g. 
`*/30 * * * *`) | +| `--timezone ` | Schedule timezone (default: `UTC`) | +| `--page ` | Single page URL to scrape on each check | +| `--scrape-urls ` | Comma-separated URLs to scrape on each check | +| `--crawl-url ` | Root URL for a crawl target (every discovered page gets diffed) | +| `--queries ` | Comma-separated search queries for a **web monitor** (requires `--goal`) | +| `--search-window ` | Web-monitor recency: `5m`, `15m`, `1h`, `6h`, `24h`, `7d` (default `24h`) | +| `--max-results ` | Web-monitor results per query, 1–50 (default `10`) | +| `--include-domains ` | Restrict web-monitor results to these domains (comma-separated) | +| `--exclude-domains ` | Exclude these domains from web-monitor results (comma-separated) | +| `--webhook-url ` | Webhook destination | +| `--webhook-events ` | `monitor.page`, `monitor.check.completed` (comma-separated) | +| `--email ` | Comma-separated email recipients | +| `--retention-days ` | Snapshot retention window | +| `--state ` | `active` or `paused` (update only — use `--state`, not `--status`) | +| `--page-status ` | Filter `check` results: `same`, `new`, `changed`, `removed`, `error` | +| `-o, --output ` | Output file path | +| `--pretty` | Pretty-print JSON output | Minimum schedule interval is **15 minutes**. Monitoring is **not available for zero-data-retention teams**. +## Web monitors (monitor the web) + +Page and crawl monitors watch URLs you already have. A **web monitor** watches the whole web instead: give it search queries and a goal, and each check runs the searches, judges every result against your goal, and alerts you on **new** results you haven't seen before. Reach for it when there's no URL to bookmark yet — new product launches, funding rounds, papers, news, releases, or brand mentions. + +```bash +firecrawl monitor create --name "AI model releases" --schedule "daily at 9:00" \ + --queries "new AI model release,frontier model launch" \ + --goal "Alert when a major lab releases a new AI model. 
Ignore tutorials and listicles." \ + --search-window 7d --max-results 20 \ + --webhook-url https://example.com/hook +``` + +- **`--queries` and `--goal` are both required.** Queries are comma-separated; the goal is what the AI judge scores each result against, so only on-topic results alert you. +- **`--search-window`** sets recency — `5m`, `15m`, `1h`, `6h`, `24h`, `7d` (default `24h`). Widen it for niche topics that don't publish often. +- **`--max-results`** caps results per query, 1–50 (default `10`). +- **`--include-domains` / `--exclude-domains`** restrict or exclude sources (comma-separated). +- **Result model:** web-monitor results are labeled `new` (first time seen) or `same` (already seen on a prior check) — never `changed`/`removed`. Dedup means a result alerts you **once**, when it first appears. Webhooks and email work exactly as they do for page monitors. +- The [`--goal` guidance](#writing-a-good---goal) below applies: state what counts as a match in plain language and add `Ignore ...` only for intent-specific exclusions. + ## Writing a good `--goal` The goal is what the AI change judge uses to decide whether a page is `changed` vs `same`. 
Convert the user's intent into a concise 2-3 sentence goal: diff --git a/src/__tests__/commands/monitor.test.ts b/src/__tests__/commands/monitor.test.ts index 643b476ed2..715bdef0ef 100644 --- a/src/__tests__/commands/monitor.test.ts +++ b/src/__tests__/commands/monitor.test.ts @@ -28,6 +28,49 @@ describe('monitor command helpers', () => { }); }); + it('builds a search target from queries and search options', () => { + expect( + buildCreateBody({ + name: 'LLM releases', + goal: 'Notify me about major new LLM model releases', + scheduleText: 'every 2 hours', + timezone: 'UTC', + queries: ['new LLM release', 'frontier model launch'], + searchWindow: '24h', + maxResults: 10, + includeDomains: ['openai.com'], + excludeDomains: ['reddit.com'], + }) + ).toEqual({ + name: 'LLM releases', + goal: 'Notify me about major new LLM model releases', + schedule: { + text: 'every 2 hours', + timezone: 'UTC', + }, + targets: [ + { + type: 'search', + queries: ['new LLM release', 'frontier model launch'], + searchWindow: '24h', + maxResults: 10, + includeDomains: ['openai.com'], + excludeDomains: ['reddit.com'], + }, + ], + }); + }); + + it('requires a goal for web monitors', () => { + expect(() => + buildCreateBody({ + name: 'No goal', + scheduleText: 'hourly', + queries: ['something'], + }) + ).toThrow(/goal is required for web monitors/); + }); + it('supports the simple page plus goal path', () => { expect( buildCreateBody({ diff --git a/src/commands/monitor.ts b/src/commands/monitor.ts index 13ac0d2d79..5af0ddba82 100644 --- a/src/commands/monitor.ts +++ b/src/commands/monitor.ts @@ -1,8 +1,8 @@ /** * `firecrawl monitor` — manage Firecrawl monitors. * - * Monitors run recurring scrapes/crawls and diff each result against the last - * retained snapshot. See features/monitoring in the docs. + * Monitors run recurring scrapes/crawls/searches and diff each result against + * the last retained snapshot. See features/monitoring in the docs. 
* * firecrawl@4.22.2 exposes monitor methods (createMonitor, * listMonitors, getMonitor, updateMonitor, deleteMonitor, runMonitor, @@ -141,6 +141,11 @@ export function buildCreateBody(opts: { page?: string; urls?: string[]; crawlUrl?: string; + queries?: string[]; + searchWindow?: string; + maxResults?: number; + includeDomains?: string[]; + excludeDomains?: string[]; webhookUrl?: string; webhookEvents?: string[]; emailRecipients?: string[]; @@ -160,8 +165,14 @@ export function buildCreateBody(opts: { : undefined; const hasScrape = urls && urls.length > 0; const hasCrawl = !!opts.crawlUrl; - if (!hasScrape && !hasCrawl) { - throw new Error('Provide --scrape-urls or --crawl-url'); + const hasSearch = !!(opts.queries && opts.queries.length > 0); + if (!hasScrape && !hasCrawl && !hasSearch) { + throw new Error('Provide --scrape-urls, --crawl-url, or --queries'); + } + // The API requires a non-empty goal whenever a search target is present + // (it auto-enables the AI judge). Fail early with a clear message. 
+ if (hasSearch && (!opts.goal || !opts.goal.trim())) { + throw new Error('--goal is required for web monitors (--queries)'); } const schedule: Record = {}; @@ -172,6 +183,20 @@ export function buildCreateBody(opts: { const targets: unknown[] = []; if (hasScrape) targets.push({ type: 'scrape', urls }); if (hasCrawl) targets.push({ type: 'crawl', url: opts.crawlUrl }); + if (hasSearch) { + const searchTarget: Record = { + type: 'search', + queries: opts.queries, + }; + if (opts.searchWindow) searchTarget.searchWindow = opts.searchWindow; + if (opts.maxResults !== undefined) + searchTarget.maxResults = opts.maxResults; + if (opts.includeDomains && opts.includeDomains.length > 0) + searchTarget.includeDomains = opts.includeDomains; + if (opts.excludeDomains && opts.excludeDomains.length > 0) + searchTarget.excludeDomains = opts.excludeDomains; + targets.push(searchTarget); + } const body: Record = { name: opts.name, @@ -219,7 +244,7 @@ function commonOptions(cmd: Command): Command { */ export function createMonitorCommand(): Command { const monitor = new Command('monitor').description( - 'Schedule recurring scrapes/crawls and track content changes' + 'Schedule recurring scrapes/crawls/searches and track content changes' ); // create @@ -245,6 +270,30 @@ export function createMonitorCommand(): Command { parseCommaList ) .option('--crawl-url ', 'Root URL for a crawl target') + .option( + '--queries ', + 'Comma-separated search queries for a search target (requires --goal)', + parseCommaList + ) + .option( + '--search-window ', + 'Search recency window: 5m, 15m, 1h, 6h, 24h, 7d (default: 24h)' + ) + .option( + '--max-results ', + 'Max search results per query, 1-50 (default: 10)', + parseInt + ) + .option( + '--include-domains ', + 'Comma-separated domains to restrict search results to', + parseCommaList + ) + .option( + '--exclude-domains ', + 'Comma-separated domains to exclude from search results', + parseCommaList + ) .option('--webhook-url ', 'Webhook destination') 
.option( '--webhook-events ', @@ -275,6 +324,11 @@ export function createMonitorCommand(): Command { page: options.page, urls: options.scrapeUrls, crawlUrl: options.crawlUrl, + queries: options.queries, + searchWindow: options.searchWindow, + maxResults: options.maxResults, + includeDomains: options.includeDomains, + excludeDomains: options.excludeDomains, webhookUrl: options.webhookUrl, webhookEvents: options.webhookEvents, emailRecipients: options.email, @@ -480,6 +534,11 @@ Examples: --schedule "every 30 minutes" \\ --page https://example.com/blog \\ --email alerts@example.com + $ firecrawl monitor create --name "LLM releases" \\ + --goal "Notify me about major new LLM model releases" \\ + --schedule "every 2 hours" \\ + --queries "new LLM release,frontier model launch" \\ + --search-window 24h --max-results 10 $ firecrawl monitor create monitor.json $ cat monitor.json | firecrawl monitor create $ firecrawl monitor list --limit 20