See the shub documentation for custom Docker image deployment instructions.
Chrome
Dockerfile Example for Chrome
FROM scrapinghub/scrapinghub-stack-scrapy:2.1

# Helpers used later in the build. Combine update+install in one layer,
# pass -y so the build never waits on a prompt, and clean the apt lists
# in the same layer so they don't bloat the image.
# (A blanket `apt-get upgrade` is avoided — bump the base image tag instead.)
RUN apt-get update \
    && apt-get install -y --no-install-recommends zip unzip \
    && rm -rf /var/lib/apt/lists/*

#============================================
# Google Chrome
#============================================
# can specify versions by CHROME_VERSION;
# e.g. google-chrome-stable=53.0.2785.101-1
#      google-chrome-beta=53.0.2785.92-1
#      google-chrome-unstable=54.0.2840.14-1
#      latest (equivalent to google-chrome-stable)
#      google-chrome-beta (pull latest beta)
#============================================
ARG CHROME_VERSION="google-chrome-stable"
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
    && echo "deb http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
    && apt-get update -qqy \
    && apt-get -qqy install \
       ${CHROME_VERSION:-google-chrome-stable} \
    && rm /etc/apt/sources.list.d/google-chrome.list \
    && rm -rf /var/lib/apt/lists/* /var/cache/apt/*

#============================================
# Chrome Webdriver
#============================================
# can specify versions by CHROME_DRIVER_VERSION
# Latest released version will be used by default
#============================================
ARG CHROME_DRIVER_VERSION
RUN CHROME_STRING=$(google-chrome --version) \
    && CHROME_VERSION_STRING=$(echo "${CHROME_STRING}" | grep -oP "\d+\.\d+\.\d+\.\d+") \
    && CHROME_MAYOR_VERSION=$(echo "${CHROME_VERSION_STRING%%.*}") \
    && wget --no-verbose -O /tmp/LATEST_RELEASE "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_${CHROME_MAYOR_VERSION}" \
    && CD_VERSION=$(cat "/tmp/LATEST_RELEASE") \
    && rm /tmp/LATEST_RELEASE \
    && if [ -z "$CHROME_DRIVER_VERSION" ]; \
       then CHROME_DRIVER_VERSION="${CD_VERSION}"; \
       fi \
    && CD_VERSION=$(echo $CHROME_DRIVER_VERSION) \
    && echo "Using chromedriver version: "$CD_VERSION \
    && wget --no-verbose -O /tmp/chromedriver_linux64.zip https://chromedriver.storage.googleapis.com/$CD_VERSION/chromedriver_linux64.zip \
    && rm -rf /opt/selenium/chromedriver \
    && unzip /tmp/chromedriver_linux64.zip -d /opt/selenium \
    && rm /tmp/chromedriver_linux64.zip \
    && mv /opt/selenium/chromedriver /opt/selenium/chromedriver-$CD_VERSION \
    && chmod 755 /opt/selenium/chromedriver-$CD_VERSION \
    # image builds run as root; `sudo` is unnecessary (and may not exist)
    && ln -fs /opt/selenium/chromedriver-$CD_VERSION /usr/bin/chromedriver

ENV TERM=xterm
# Replace <PROJECT_NAME> with the actual name of the Scrapy project being deployed.
ENV SCRAPY_SETTINGS_MODULE=<PROJECT_NAME>.settings

# WORKDIR creates /app if it does not exist; no `mkdir -p` needed.
WORKDIR /app
# Copy the manifest alone first so the pip layer is cached until it changes.
COPY ./requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY . /app
RUN python setup.py install
Set the value of the ENV SCRAPY_SETTINGS_MODULE instruction to the settings module of the Scrapy project being deployed (i.e. <PROJECT_NAME>.settings, with <PROJECT_NAME> replaced by the actual project name).
Spider Code Example for Chrome
# demo.py
import scrapy
from selenium import webdriver


class DemoSpider(scrapy.Spider):
    """Scrape quotes from a JS-rendered page by re-loading each URL in
    headless Chrome, then yield one dict per quote and follow pagination."""

    name = 'demo'
    start_urls = ['http://quotes.toscrape.com/js']

    def __init__(self, *args, **kwargs):
        super(DemoSpider, self).__init__(*args, **kwargs)
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-extensions")
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        # --no-sandbox is required when Chrome runs as root in a container.
        options.add_argument("--no-sandbox")
        # `chrome_options=` is deprecated in Selenium; `options=` is the
        # supported keyword (and matches the Firefox example below).
        self.driver = webdriver.Chrome(options=options,
                                       executable_path='/usr/bin/chromedriver')

    def parse(self, response):
        # Fetch the URL again with the real browser so the JavaScript runs.
        self.driver.get(response.url)
        for quote in self.driver.find_elements_by_css_selector('div.quote'):
            yield {
                'quote': quote.find_element_by_css_selector('span').text,
                'author': quote.find_element_by_css_selector('small').text,
            }
        # Pagination link is present in the static HTML, so read it from
        # the Scrapy response rather than the browser.
        next_page_url = response.css('nav li.next a ::attr(href)').extract_first()
        if next_page_url:
            yield scrapy.Request(response.urljoin(next_page_url))
Firefox
Dockerfile Example for Firefox
FROM scrapinghub/scrapinghub-stack-scrapy:1.3

# Debian jessie packages moved to archive.debian.org; point apt there
# before installing anything.
RUN printf "deb http://archive.debian.org/debian/ jessie main\ndeb-src http://archive.debian.org/debian/ jessie main\ndeb http://security.debian.org jessie/updates main\ndeb-src http://security.debian.org jessie/updates main" > /etc/apt/sources.list

#============================================
# Firefox and Geckodriver
#============================================
# unzip is installed here (with -y, in the same layer as apt-get update)
# instead of in a separate interactive `apt-get install unzip`, which
# would abort a non-interactive build at the confirmation prompt.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
       ca-certificates curl firefox-esr unzip \
    && rm -fr /var/lib/apt/lists/* \
    && curl -L https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz | tar xz -C /usr/local/bin \
    && apt-get purge -y ca-certificates curl

ENV TERM=xterm
# Replace <PROJECT_NAME> with the actual name of the Scrapy project being deployed.
ENV SCRAPY_SETTINGS_MODULE=<PROJECT_NAME>.settings

# WORKDIR creates /app if it does not exist; no `mkdir -p` needed.
WORKDIR /app
# Copy the manifest alone first so the pip layer is cached until it changes.
COPY ./requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY . /app
RUN python setup.py install
Alternatively, for using Firefox with Python 3:
FROM scrapinghub/scrapinghub-stack-scrapy:1.8-py3

# Tools needed by later steps. update+install combined in one layer
# (a lone `apt-get update` layer can serve a stale cache), lists cleaned
# in the same layer to keep the image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends unzip procps \
    && rm -rf /var/lib/apt/lists/*

# Debian jessie packages moved to archive.debian.org; [trusted=yes]
# because the archived repository signatures have expired.
RUN printf "deb [trusted=yes] http://archive.debian.org/debian/ jessie main\ndeb-src [trusted=yes] http://archive.debian.org/debian/ jessie main\ndeb [trusted=yes] http://security.debian.org jessie/updates main\ndeb-src [trusted=yes] http://security.debian.org jessie/updates main" > /etc/apt/sources.list

#============================================
# Firefox and Geckodriver
#============================================
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
       ca-certificates curl firefox-esr \
    && rm -fr /var/lib/apt/lists/* \
    && curl -L https://github.com/mozilla/geckodriver/releases/download/v0.29.1/geckodriver-v0.29.1-linux64.tar.gz | tar xz -C /usr/local/bin \
    && apt-get purge -y ca-certificates curl

ENV TERM=xterm
# Replace <PROJECT_NAME> with the actual name of the Scrapy project being deployed.
ENV SCRAPY_SETTINGS_MODULE=<PROJECT_NAME>.settings

# WORKDIR creates /app if it does not exist; no `mkdir -p` needed.
WORKDIR /app
# Copy the manifest alone first so the pip layer is cached until it changes.
COPY ./requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY . /app
RUN python setup.py install
Replace <PROJECT_NAME> in ENV SCRAPY_SETTINGS_MODULE instruction with the actual name of the Scrapy project being deployed.
Spider Code Example for Firefox
# demo.py
import scrapy
from selenium import webdriver


class DemoSpider(scrapy.Spider):
    """Render JS-driven quote pages in headless Firefox and yield one
    dict per quote, following pagination links from the static HTML."""

    name = 'demo'
    start_urls = ['http://quotes.toscrape.com/js']

    def __init__(self, *args, **kwargs):
        super(DemoSpider, self).__init__(*args, **kwargs)
        firefox_options = webdriver.FirefoxOptions()
        # Apply the browser flags in order before starting the driver.
        for flag in ("--window-size 1920,1080", "--headless"):
            firefox_options.add_argument(flag)
        self.driver = webdriver.Firefox(options=firefox_options)

    def parse(self, response):
        # Load the URL in the real browser so its JavaScript executes.
        self.driver.get(response.url)
        quote_blocks = self.driver.find_elements_by_css_selector('div.quote')
        for block in quote_blocks:
            item = {
                'quote': block.find_element_by_css_selector('span').text,
                'author': block.find_element_by_css_selector('small').text,
            }
            yield item
        # The "next" link exists in the static HTML, so take it from the
        # Scrapy response rather than the Selenium DOM.
        next_page_url = response.css('nav li.next a ::attr(href)').extract_first()
        if next_page_url:
            yield scrapy.Request(response.urljoin(next_page_url))
Quick Start Steps
- For a jump start, download the sample project from https://github.com/scrapinghub/sample-projects/tree/master/sc_custom_image (clone the repository or download it as a ZIP).
- Create a Dockerfile in sc_custom_image root folder (where scrapy.cfg is), copy/paste the content of either Dockerfile example above, and replace <PROJECT_NAME> with sc_custom_image.
- Update scrapinghub.yml with the numerical ID of the Scrapy Cloud project that will contain the spider being deployed.
- Create setup.py file, taking this one as an example: https://github.com/scrapinghub/sample-projects/blob/master/splash_crawlera_example/setup.py (omit package_data declaration line).
- Replace demo.py located at sc_custom_image/sc_custom_image/spiders with the content of either spider code example above.
- Run shub deploy.