diff --git a/crawlee/Dockerfile b/crawlee/Dockerfile
deleted file mode 100644
index 4018d6b..0000000
--- a/crawlee/Dockerfile
+++ /dev/null
@@ -1,29 +0,0 @@
-# Specify the base Docker image. You can read more about
-# the available images at https://crawlee.dev/docs/guides/docker-images
-# You can also use any other image from Docker Hub.
-FROM apify/actor-node:16
-
-# Copy just package.json and package-lock.json
-# to speed up the build using Docker layer cache.
-COPY package*.json ./
-
-# Install NPM packages, skip optional and development dependencies to
-# keep the image small. Avoid logging too much and print the dependency
-# tree for debugging
-RUN npm --quiet set progress=false \
-    && npm install --omit=dev --omit=optional \
-    && echo "Installed NPM packages:" \
-    && (npm list --omit=dev --all || true) \
-    && echo "Node.js version:" \
-    && node --version \
-    && echo "NPM version:" \
-    && npm --version
-
-# Next, copy the remaining files and directories with the source code.
-# Since we do this after NPM install, quick build will be really fast
-# for most source file changes.
-COPY . ./
-
-
-# Run the image.
-CMD npm start --silent
diff --git a/crawlee/README.md b/crawlee/README.md
index 616b5c2..b8986f5 100644
--- a/crawlee/README.md
+++ b/crawlee/README.md
@@ -5,23 +5,16 @@
 crawlee scraping and browser automation library.
 
 ```bash
-$ docker-compose build
-Building crawlee
-Successfully built xxxxxxxxxxxx
-Successfully tagged crawlee:latest
+$ docker run --rm -e PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 -e PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1 -v $PWD:/tmp -w /tmp apify/actor-node:16 npx crawlee create -t cheerio-js my-crawler
 
-$ docker-compose run --rm crawlee
-INFO BasicCrawler: Starting the crawl
-INFO BasicCrawler: Processing ...
-Crawler finished.
+$ docker-compose build my-crawler
 
-$ tree data
+$ docker-compose run --rm my-crawler
+
+$ tree my-crawler/storage/
 ├── datasets
 │   └── default
-│       ├── 000000001.json
-│       ├── 000000002.json
-│       ├── 000000003.json
-│       └── 000000004.json
+│       └── 000000001.json
 ├── key_value_stores
 └── request_queues
 ```
diff --git a/crawlee/docker-compose.yml b/crawlee/docker-compose.yml
index 81f2dd7..2a78e34 100644
--- a/crawlee/docker-compose.yml
+++ b/crawlee/docker-compose.yml
@@ -1,7 +1,11 @@
 version: "3.8"
+
 services:
-  crawlee:
-    image: crawlee
-    build: .
+
+  my-crawler:
+    image: my-crawler
+    build:
+      context: my-crawler
+      dockerfile: Dockerfile
     volumes:
-      - ./data:/usr/src/app/storage
+      - ./my-crawler/storage:/usr/src/app/storage
diff --git a/crawlee/main.js b/crawlee/main.js
deleted file mode 100644
index f1e4584..0000000
--- a/crawlee/main.js
+++ /dev/null
@@ -1,35 +0,0 @@
-import { BasicCrawler, Dataset } from 'crawlee';
-
-// Create a BasicCrawler - the simplest crawler that enables
-// users to implement the crawling logic themselves.
-const crawler = new BasicCrawler({
-    // This function will be called for each URL to crawl.
-    async requestHandler({ request, sendRequest, log }) {
-        const { url } = request;
-        log.info(`Processing ${url}...`);
-
-        // Fetch the page HTML via the crawlee sendRequest utility method
-        // By default, the method will use the current request that is being handled, so you don't have to
-        // provide it yourself. You can also provide a custom request if you want.
-        const { body } = await sendRequest();
-
-        // Store the HTML and URL to the default dataset.
-        await Dataset.pushData({
-            url,
-            html: body,
-        });
-    },
-});
-
-// The initial list of URLs to crawl. Here we use just a few hard-coded URLs.
-await crawler.addRequests([
-    'https://www.google.com',
-    'https://www.example.com',
-    'https://www.bing.com',
-    'https://www.wikipedia.com',
-]);
-
-// Run the crawler and wait for it to finish.
-await crawler.run();
-
-console.log('Crawler finished.');
diff --git a/crawlee/package.json b/crawlee/package.json
deleted file mode 100644
index adb49ff..0000000
--- a/crawlee/package.json
+++ /dev/null
@@ -1,14 +0,0 @@
-{
-    "description": "Crawlee Demo Project",
-    "version": "0.0.1",
-    "license": "UNLICENSED",
-    "type": "module",
-    "main": "main.js",
-    "scripts": {
-        "start": "node main.js"
-    },
-    "dependencies": {
-        "crawlee": "*"
-    },
-    "repository": {}
-}
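The hand-written `main.js` deleted above is superseded by the project that `npx crawlee create -t cheerio-js my-crawler` generates; that generated code is not part of this diff. A minimal sketch of the equivalent CheerioCrawler entry point, assuming the default template behavior:

```javascript
import { CheerioCrawler, Dataset } from 'crawlee';

// CheerioCrawler downloads each page and parses its HTML with cheerio,
// exposing the parsed document as `$` in the request handler.
const crawler = new CheerioCrawler({
    async requestHandler({ request, $, log }) {
        const title = $('title').text();
        log.info(`Title of ${request.loadedUrl} is '${title}'`);

        // Results land in storage/datasets/default inside the container,
        // which docker-compose.yml mounts at ./my-crawler/storage on the host.
        await Dataset.pushData({ title, url: request.loadedUrl });
    },
});

await crawler.run(['https://crawlee.dev']);
```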