feat(web): Use crawlee/playwright to retrieve web content in quality mode. It retrieves content more reliably than fetch + JSDOM, at the expense of speed.

This commit is contained in:
Willie Zutz 2025-05-24 14:37:19 -06:00
parent 044f30a547
commit 87a7ffb445
10 changed files with 4580 additions and 549 deletions

View file

@@ -1,9 +1,10 @@
# Build stage ("builder").
# NOTE(review): this is a diff view without +/- markers; the first FROM appears
# to be the pre-change line and the second its replacement — confirm against
# the commit. The replacement moves from the exact 20.18.0 tag to the floating
# 20 tag and hardcodes the linux/amd64 platform.
FROM node:20.18.0-slim AS builder
FROM --platform=linux/amd64 node:20-slim AS builder
WORKDIR /home/perplexica
# Copy only the manifest and lockfile first so the dependency layer below is
# reused from cache until either of them changes.
COPY package.json yarn.lock ./
# --frozen-lockfile: fail rather than rewrite yarn.lock;
# --network-timeout is given in milliseconds (600000 ms = 10 minutes).
RUN yarn install --frozen-lockfile --network-timeout 600000
# Opt out of Next.js telemetry for the build stage.
ENV NEXT_TELEMETRY_DISABLED=1
# Project configuration files, then the application sources.
COPY tsconfig.json next.config.mjs next-env.d.ts postcss.config.js drizzle.config.ts tailwind.config.ts ./
COPY src ./src
@@ -12,7 +13,9 @@ COPY public ./public
# Create the data directory (later copied into the final stage), then run the
# project's build script.
RUN mkdir -p /home/perplexica/data
RUN yarn build
# Final (unnamed) stage starts here.
# NOTE(review): as with the builder stage, the first FROM appears to be the
# pre-change line and the second its replacement — confirm against the commit.
FROM node:20.18.0-slim
FROM --platform=linux/amd64 node:20-slim
# Opt out of Next.js telemetry in the running container as well.
ENV NEXT_TELEMETRY_DISABLED=1
WORKDIR /home/perplexica
@@ -22,6 +25,11 @@ COPY --from=builder /home/perplexica/.next/static ./public/_next/static
# Pull the build output and the data directory out of the builder stage into
# the working directory of the final stage.
COPY --from=builder /home/perplexica/.next/standalone ./
COPY --from=builder /home/perplexica/data ./data
# NOTE(review): this single-command RUN appears to be the pre-change line; the
# multi-line RUN that follows creates the same directory — confirm against the
# commit that only one of the two is present in the real Dockerfile.
RUN mkdir /home/perplexica/uploads
# Runtime setup for the crawlee/playwright content retriever, kept in a single
# layer so the apt cleanup at the end actually shrinks the image.
#  - mkdir -p: create the uploads directory without failing if it already exists.
#  - procps: NOTE(review): presumably installed because the crawler shells out
#    to `ps` at runtime — confirm. --no-install-recommends keeps optional
#    packages out of the image.
#  - playwright is installed BEFORE the browser download so that `npx` resolves
#    the local package and fetches the Chromium build matching that exact
#    library version, instead of whatever a throwaway npx copy would pick.
#  - --with-deps also installs Chromium's OS libraries through apt, so the apt
#    lists are removed only after it has run.
RUN mkdir -p /home/perplexica/uploads && \
    apt-get update && \
    apt-get install -y --no-install-recommends procps && \
    npm install playwright && \
    npx -y playwright install --with-deps chromium && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Exec-form CMD: node runs server.js directly, without a shell wrapper.
CMD ["node", "server.js"]