FireCrawl Local Installation
docker-compose.yaml
name: firecrawl

x-common-service: &common-service
  # NOTE: If you don't want to build the service locally,
  # comment out the build: statement and uncomment the image: statement
  image: ghcr.io/firecrawl/firecrawl:latest
  # build: apps/api
  ulimits:
    nofile:
      soft: 65535
      hard: 65535
  networks:
    - backend
  extra_hosts:
    - "host.docker.internal:host-gateway"
  logging:
    driver: "json-file"
    options:
      max-size: "10m"
      max-file: "3"
      compress: "true"

x-common-env: &common-env
  REDIS_URL: ${REDIS_URL:-redis://redis:6379}
  REDIS_RATE_LIMIT_URL: ${REDIS_URL:-redis://redis:6379}
  PLAYWRIGHT_MICROSERVICE_URL: ${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000/scrape}
  POSTGRES_USER: ${POSTGRES_USER:-postgres}
  POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-postgres}
  POSTGRES_DB: ${POSTGRES_DB:-postgres}
  POSTGRES_HOST: ${POSTGRES_HOST:-nuq-postgres}
  POSTGRES_PORT: ${POSTGRES_PORT:-5432}
  USE_DB_AUTHENTICATION: ${USE_DB_AUTHENTICATION:-false}
  NUM_WORKERS_PER_QUEUE: ${NUM_WORKERS_PER_QUEUE:-8}
  CRAWL_CONCURRENT_REQUESTS: ${CRAWL_CONCURRENT_REQUESTS:-10}
  MAX_CONCURRENT_JOBS: ${MAX_CONCURRENT_JOBS:-5}
  BROWSER_POOL_SIZE: ${BROWSER_POOL_SIZE:-5}
  OPENAI_API_KEY: ${OPENAI_API_KEY}
  OPENAI_BASE_URL: ${OPENAI_BASE_URL}
  MODEL_NAME: ${MODEL_NAME}
  MODEL_EMBEDDING_NAME: ${MODEL_EMBEDDING_NAME}
  OLLAMA_BASE_URL: ${OLLAMA_BASE_URL}
  AUTUMN_SECRET_KEY: ${AUTUMN_SECRET_KEY}
  SLACK_WEBHOOK_URL: ${SLACK_WEBHOOK_URL}
  BULL_AUTH_KEY: ${BULL_AUTH_KEY}
  TEST_API_KEY: ${TEST_API_KEY}
  SUPABASE_ANON_TOKEN: ${SUPABASE_ANON_TOKEN}
  SUPABASE_URL: ${SUPABASE_URL}
  SUPABASE_SERVICE_TOKEN: ${SUPABASE_SERVICE_TOKEN}
  SELF_HOSTED_WEBHOOK_URL: ${SELF_HOSTED_WEBHOOK_URL}
  LOGGING_LEVEL: ${LOGGING_LEVEL}
  PROXY_SERVER: ${PROXY_SERVER}
  PROXY_USERNAME: ${PROXY_USERNAME}
  PROXY_PASSWORD: ${PROXY_PASSWORD}
  SEARXNG_ENDPOINT: ${SEARXNG_ENDPOINT}
  SEARXNG_ENGINES: ${SEARXNG_ENGINES}
  SEARXNG_CATEGORIES: ${SEARXNG_CATEGORIES}
  MAX_CPU: ${MAX_CPU:-0.8}
  MAX_RAM: ${MAX_RAM:-0.8}
  ALLOW_LOCAL_WEBHOOKS: ${ALLOW_LOCAL_WEBHOOKS:-false}

services:
  playwright-service:
    # NOTE: If you don't want to build the service locally,
    # comment out the build: statement and uncomment the image: statement
    image: ghcr.io/firecrawl/playwright-service:latest
    # build: apps/playwright-service-ts
    environment:
      PORT: 3000
      PROXY_SERVER: ${PROXY_SERVER}
      PROXY_USERNAME: ${PROXY_USERNAME}
      PROXY_PASSWORD: ${PROXY_PASSWORD}
      ALLOW_LOCAL_WEBHOOKS: ${ALLOW_LOCAL_WEBHOOKS:-false}
      BLOCK_MEDIA: ${BLOCK_MEDIA:-false}
      # Configure maximum concurrent pages for Playwright browser instances
      MAX_CONCURRENT_PAGES: ${CRAWL_CONCURRENT_REQUESTS:-10}
    networks:
      - backend
    # Resource limits for Docker Compose (not Swarm)
    cpus: 2.0
    mem_limit: 4G
    memswap_limit: 4G
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
        compress: "true"
    tmpfs:
      - /tmp/.cache:noexec,nosuid,size=1g

  api:
    <<: *common-service
    environment:
      <<: *common-env
      HOST: "0.0.0.0"
      PORT: ${INTERNAL_PORT:-3002}
      EXTRACT_WORKER_PORT: ${EXTRACT_WORKER_PORT:-3004}
      WORKER_PORT: ${WORKER_PORT:-3005}
      NUQ_RABBITMQ_URL: amqp://rabbitmq:5672
      ENV: local
    depends_on:
      redis:
        condition: service_started
      playwright-service:
        condition: service_started
      rabbitmq:
        condition: service_healthy
      nuq-postgres:
        condition: service_started
    ports:
      - "${PORT:-3002}:${INTERNAL_PORT:-3002}"
    command: node dist/src/harness.js --start-docker
    # Resource limits for Docker Compose (not Swarm)
    # Increase if you have more CPU cores/RAM available
    cpus: 4.0
    mem_limit: 8G
    memswap_limit: 8G

  redis:
    # NOTE: If you want to use Valkey (open source) instead of Redis (source available),
    # uncomment the Valkey statement and comment out the Redis statement.
    # Using Valkey with Firecrawl is untested and not guaranteed to work. Use with caution.
    image: redis:alpine
    # image: valkey/valkey:alpine
    networks:
      - backend
    command: redis-server --bind 0.0.0.0
    volumes:
      - redis-data:/data
    logging:
      driver: "json-file"
      options:
        max-size: "5m"
        max-file: "2"
        compress: "true"

  rabbitmq:
    image: rabbitmq:3-management
    networks:
      - backend
    command: rabbitmq-server
    healthcheck:
      test: ["CMD", "rabbitmq-diagnostics", "-q", "check_running"]
      interval: 10s
      timeout: 10s
      retries: 10
      start_period: 30s
    volumes:
      - rabbitmq-data:/var/lib/rabbitmq
    logging:
      driver: "json-file"
      options:
        max-size: "5m"
        max-file: "2"
        compress: "true"

  nuq-postgres:
    image: ghcr.io/firecrawl/nuq-postgres:latest
    environment:
      POSTGRES_USER: ${POSTGRES_USER:-postgres}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-postgres}
      POSTGRES_DB: ${POSTGRES_DB:-postgres}
    networks:
      - backend
    volumes:
      - postgres-data:/var/lib/postgresql/data
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
        compress: "true"

networks:
  backend:
    driver: bridge

volumes:
  redis-data:
    driver: local
  rabbitmq-data:
    driver: local
  postgres-data:
    driver: local
.env
# ===== Required ENVS ======
PORT=8019
HOST=0.0.0.0
# Note: PORT is used by both the main API server and worker liveness check endpoint
# To turn on DB authentication, you need to set up Supabase.
USE_DB_AUTHENTICATION=false
# ===== Optional ENVS ======
## === AI features (JSON format on scrape, /extract API) ===
# Provide your OpenAI API key here to enable AI features
# OPENAI_API_KEY=your_openai_api_key_here
# Experimental: Use Ollama
# OLLAMA_BASE_URL=http://localhost:11434/api
# MODEL_NAME=deepseek-r1:7b
# MODEL_EMBEDDING_NAME=nomic-embed-text
# Experimental: Use any OpenAI-compatible API
# OPENAI_BASE_URL=https://example.com/v1
# OPENAI_API_KEY=your_api_key_here
## === Proxy ===
# PROXY_SERVER can be a full URL (e.g. http://0.1.2.3:1234) or just an IP and port combo (e.g. 0.1.2.3:1234)
# Do not uncomment PROXY_USERNAME and PROXY_PASSWORD if your proxy is unauthenticated
# PROXY_SERVER=
# PROXY_USERNAME=
# PROXY_PASSWORD=
## === /search API ===
# By default, the /search API will use Google search.
# You can specify a SearXNG server with the JSON format enabled, if you'd like to use that instead of direct Google.
# You can also customize the engines and categories parameters, but the defaults should also work just fine.
# SEARXNG_ENDPOINT=http://your.searxng.server
# SEARXNG_ENGINES=
# SEARXNG_CATEGORIES=
## === PostgreSQL Database Configuration ===
# Configure PostgreSQL credentials. These should match the credentials used by the nuq-postgres container.
# If you change these, ensure all three are set consistently.
# Note: nuq-postgres requires using 'postgres' as the database name for proper pg_cron initialization
POSTGRES_USER=postgres
POSTGRES_PASSWORD=postgres
POSTGRES_DB=postgres
## === Redis Configuration ===
# These are auto-configured by docker-compose.yaml. You shouldn't need to change them.
# REDIS_URL=redis://redis:6379
# REDIS_RATE_LIMIT_URL=redis://redis:6379
## === Playwright Service ===
# This is auto-configured by docker-compose.yaml. You shouldn't need to change it.
# PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/scrape
## === Supabase Setup (used to support DB authentication, advanced logging, etc.) ===
# SUPABASE_ANON_TOKEN=
# SUPABASE_URL=
# SUPABASE_SERVICE_TOKEN=
# Use if you've set up authentication and want to test with a real API key
# TEST_API_KEY=
# This key lets you access the queue admin panel. Change this if your deployment is publicly accessible.
BULL_AUTH_KEY=CHANGEME
## === PDF Parsing ===
# Set if you have a llamaparse key you'd like to use to parse pdfs
# LLAMAPARSE_API_KEY=
## === Monitoring ===
# Set if you'd like to send server health status messages to Slack
# SLACK_WEBHOOK_URL=
# Set if you'd like to send posthog events like job logs
# POSTHOG_API_KEY=
# POSTHOG_HOST=
## === System Resource Configuration ===
# Maximum CPU usage threshold (0.0-1.0). Worker will reject new jobs when CPU usage exceeds this value.
# Default: 0.8 (80%)
MAX_CPU=0.8
# Maximum RAM usage threshold (0.0-1.0). Worker will reject new jobs when memory usage exceeds this value.
# Default: 0.8 (80%)
MAX_RAM=0.8
# Number of workers per queue
NUM_WORKERS_PER_QUEUE=8
# Concurrent requests for crawling
CRAWL_CONCURRENT_REQUESTS=10
# Maximum concurrent jobs
MAX_CONCURRENT_JOBS=5
# Browser pool size
BROWSER_POOL_SIZE=5
# Set if you'd like to allow local webhooks to be sent to your self-hosted instance
# ALLOW_LOCAL_WEBHOOKS=true
# Block media in Playwright
# BLOCK_MEDIA=true
# Logging level (DEBUG, INFO, WARN, ERROR)
LOGGING_LEVEL=INFO
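With both files in place, a minimal smoke test looks like the sketch below. It assumes the settings above, i.e. the API published on host port 8019, and uses FireCrawl's /v1/scrape endpoint; with USE_DB_AUTHENTICATION=false the bearer token is not validated, so any placeholder works.
```bash
# Start the stack in the background
docker compose up -d

# Give the api container a moment to start, then try a basic scrape
curl -s -X POST "http://localhost:8019/v1/scrape" \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer no-auth" \
  -d '{"url": "https://example.com", "formats": ["markdown"]}'
```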
Normally, to operate a website we open a browser and click different buttons, each button representing a different function.
A CLI completes those same website functions on the command line: cli-boss search returns job-listing data, as JSON or some other text format, which is inconvenient for a human to read but very friendly to AI.
In other words, searching the site for jobs corresponds to cli-boss search in the CLI.
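As a sketch of why that matters for automation (cli-boss is the hypothetical job-site CLI from the paragraph above, not a published tool, and the flags are made up), the JSON output can be consumed directly by a script or an agent:
```bash
# Hypothetical: search for jobs from the command line and post-process the JSON with jq
cli-boss search --keyword "python" --city "qingdao" \
  | jq '.jobs[] | {title, company, salary}'
```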
CLI installation
# Add the CLI-Anything plugin marketplace
/plugin marketplace add HKUDS/CLI-Anything
# Install the cli-anything plugin from the marketplace
/plugin install cli-anything
Workflow for building a custom CLI
Generate the code the custom CLI needs (e.g. Python)
Clone https://github.com/jgraph/drawio
/cli-anything:cli-anything ./drawio
After this runs, a cli_anything folder is generated. It contains documentation and source code, including the actions that need to happen behind each command the user types.
Build the source and generate the command
# Follow readme.md
cd cli_anything
pip install -e .
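If the editable install succeeds, the generated entry point should be on your PATH. A quick sanity check, assuming the generated command is named cli-anything-drawio as in the examples below:
```bash
cli-anything-drawio --help
```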
Using the custom CLI
# Create a new diagram
cli-anything-drawio project new mydiagram.drawio
# Open an existing diagram
cli-anything-drawio project open mydiagram.drawio
# Add a shape
cli-anything-drawio edit add --value "Hello" --x 100 --y 100
# List all elements
cli-anything-drawio edit list
# See cli_anything/readme.md for everything else
Using the custom CLI via an agent
Use the CLI commands in cli-anything-drawio to draw a flowchart of the quicksort algorithm.
Environment
hiclaw docker
cat /etc/os-release
PRETTY_NAME="Ubuntu 22.04.5 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.5 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
UBUNTU_CODENAME=jammy
Installation
npm install -g @playwright/cli@latest
Install Chrome
npx playwright install chrome
Usage
playwright-cli open www.cnn.com
### Browser `default` opened with pid 159651.
### Ran Playwright code
```js
await page.goto('https://www.cnn.com');
```
### Page
- Page URL: https://edition.cnn.com/
### Snapshot
- [Snapshot](.playwright-cli/page-2026-04-12T02-21-31-067Z.yml)
Installing the skill
cd ~
playwright-cli install --skills
By default this generates .claude/skills/playwright-cli under the current directory.
You then need to move playwright-cli into each agent's skills directory, as sketched below.
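For example, to make the skill available to another agent or project, copying the generated folder is usually enough. The destination path below is an assumption; adjust it to wherever your agent actually loads skills from.
```bash
# Copy the generated skill into another project's skills directory (assumed layout)
cp -r ~/.claude/skills/playwright-cli /path/to/your-project/.claude/skills/
```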
Case 1: open a page and enter content
Use playwright-cli with --headed --persistent to open Grok and ask what the weather is like in Qingdao today.
Case 2: open a page where the data loads only after clicking
Use playwright-cli with --headed --persistent to view https://detail.tmall.com/item.htm?&xxc=taobaoSearchabbucket=4&id=612098145454&mi_id=0000AWGjgiGB7oSCDLfegRXMs0rDPkp34S50rNrByn86yAY&ns=1&skuId=5872701131539&spm-a21n57.1.hoverItem.2utparam-%7B%22aplus_abtest%22%3A%2240acb849856085978c81b6cdec803550%22%70D
and save the first 100 reviews to a CSV file.
Have the AI save it as a skill
Create a save_mall_comments skill that captures the whole process just performed (opening the site, viewing the reviews, and saving them) and distills the pitfalls encountered along the way into the skill.
Save it directly as an executable script that does not depend on AI
Consolidate all the playwright CLI commands you just ran into a single script; running the script should fetch the product's first 100 reviews and save them to a CSV file. Make sure every step has reasonable delays and waits so the task succeeds. Once the script is written, run one round of testing yourself.
Next time, run it directly
./save_mall_comments.sh "https://detail.tmall.com/item.htm?id=612098145454&skuId=5872701131539" 100 --output tmall_reviews_612098145454_top100.csv --close-browser
Case 3: automated web testing
Read the code and write a test document in Chinese for the main flow starting from registration; testing only the main flow is enough. Then use playwright-cli open --headed --persistent to open the page and run the tests according to your test cases.
You can also combine this with openclaw to run the automated tests on a schedule.
Case 4: posting on X
Use playwright-cli with --headed --persistent to open https://x.com/compose/articles, create a new article, and paste in the content of 为什么巨头都在做CLI.local.html.
Then find the position of every small "1f4f7.svg" icon, press Backspace to delete it, copy the corresponding image from the images folder, and press Ctrl+V to paste it in. The number of "1f4f7.svg" icons equals the number of images, and you must replace them in order. Use the operating system's CLI commands for the copy step; for the paste step, pressing Ctrl+V in the browser is enough.
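One way to do the copy step with an OS-level CLI, assuming a Linux desktop session with X11 and xclip installed (the filename is only an example):
```bash
# Put one image from the images folder onto the clipboard as image/png,
# then switch to the browser window and press Ctrl+V to paste it into the editor
xclip -selection clipboard -t image/png -i images/01.png
```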
Turning it into a skill
Good, very good. Take the whole flow, starting from downloading the article's images locally, and turn it into a skill placed in the project directory. From now on, whenever I give you an article you can publish it automatically.
How to use this skill
http://xxx.xxx.xxx/xxx.html is the article path; use x-article-auto-publisher to publish it automatically.
In conversations with a large model, if someone other than the current user later asks a question similar to one a previous user already asked, the answer can be fetched from a cache right away, without calling the LLM again.
This is done with the Higress AI gateway: the caching happens entirely on the server side inside Higress, and the client needs no code changes at all.
Create a new collection named ai_higress_cache in the Milvus vector DB, with the following fields:

| Field    | Type              | Index Name | Index Type | Index Parameters   |
|----------|-------------------|------------|------------|--------------------|
| id       | Int64 (auto id)   |            |            |                    |
| vector   | FloatVector(4096) | vector     |            | metric_type:COSINE |
| question | VarChar(5000)     |            |            |                    |
| answer   | VarChar(5000)     |            |            |                    |

The three fields vector, question, and answer are required, and their names must not be changed.
Beforehand you need to configure a service for embeddings and a service for the vector DB; both can be set up under Higress service sources.
Then, in "AI Route Management", click the policy for a route, click Configure, and enter the following YAML configuration:
embedding:
  apiKey: "sk-xxxxxxx"
  model: "nvidia/llama-embed-nemotron-8b"
  path: "/v1/embeddings"
  serviceName: "llm-vllm-nvidia--llama-embed-nemotron-8b.internal.static"
  servicePort: 80
  type: "openai"
vector:
  apiKey: "empty-key"
  collectionID: "ai_higress_cache"
  serviceName: "my-milvus.static"
  servicePort: 80
  type: "milvus"
cacheKeyFrom: "messages.@reverse.0.content"
cacheKeyPrefix: "openai_gpt_oss_20b_"
cacheStreamValueFrom: "choices.0.delta.content"
cacheValueFrom: "choices.0.message.content"
returnResponseTemplate: |
  {"id":"from-cache","choices":[{"index":0,"message":{"role":"assistant","content":"%s"},"finish_reason":"stop"}],"model":"gpt-4o","object":"chat.completion","usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}
returnStreamResponseTemplate: |-
  data:{"id":"from-cache","choices":[{"index":0,"delta":{"role":"assistant","content":"%s"},"finish_reason":"stop"}],"model":"gpt-4o","object":"chat.completion","usage":{"prompt_tokens":0,"completion_tokens":0,"total_tokens":0}}
  data:[DONE]
Reference:
https://higress.ai/docs/latest/user/plugins/ai/api-provider/ai-cache/
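Once the plugin is enabled on a route, a quick way to verify the cache is to send two semantically similar questions through the gateway and compare the responses; per the returnResponseTemplate above, a cache hit comes back with "id":"from-cache". The gateway address and model name below are placeholders; use your own route.
```bash
# First request: goes through the LLM, and the answer is written to the Milvus cache
curl -s "http://<higress-gateway>/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-oss-20b", "messages": [{"role": "user", "content": "What is Higress?"}]}'

# A second, similar question should be answered from the cache:
# the response body contains "id":"from-cache"
curl -s "http://<higress-gateway>/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-oss-20b", "messages": [{"role": "user", "content": "Can you tell me what Higress is?"}]}'
```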
Installation command (uv)
curl -LsSf https://astral.sh/uv/install.sh | sh
Sync the project environment and specify the Python version.
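A minimal sketch of that step, assuming the project already has a pyproject.toml and that 3.12 is the Python version you want:
```bash
# Install the interpreter if needed, then create/refresh the project environment with it
uv python install 3.12
uv sync --python 3.12
```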