See the shub documentation for custom Docker image deployment instructions.
Chrome
Dockerfile Example for Chrome
FROM scrapinghub/scrapinghub-stack-scrapy:2.1
RUN apt-get update
RUN apt-get upgrade -y
RUN apt-get install -y zip unzip

#============================================
# Google Chrome
#============================================
# can specify versions by CHROME_VERSION;
# e.g. google-chrome-stable=53.0.2785.101-1
#      google-chrome-beta=53.0.2785.92-1
#      google-chrome-unstable=54.0.2840.14-1
#      latest (equivalent to google-chrome-stable)
#      google-chrome-beta (pull latest beta)
#============================================
ARG CHROME_VERSION="google-chrome-stable"
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
  && echo "deb http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list \
  && apt-get update -qqy \
  && apt-get -qqy install \
     ${CHROME_VERSION:-google-chrome-stable} \
  && rm /etc/apt/sources.list.d/google-chrome.list \
  && rm -rf /var/lib/apt/lists/* /var/cache/apt/*

#============================================
# Chrome Webdriver
#============================================
# can specify versions by CHROME_DRIVER_VERSION
# Latest released version will be used by default
#============================================
ARG CHROME_DRIVER_VERSION
RUN CHROME_STRING=$(google-chrome --version) \
  && CHROME_VERSION_STRING=$(echo "${CHROME_STRING}" | grep -oP "\d+\.\d+\.\d+\.\d+") \
  && CHROME_MAJOR_VERSION=$(echo "${CHROME_VERSION_STRING%%.*}") \
  && wget --no-verbose -O /tmp/LATEST_RELEASE "https://chromedriver.storage.googleapis.com/LATEST_RELEASE_${CHROME_MAJOR_VERSION}" \
  && CD_VERSION=$(cat "/tmp/LATEST_RELEASE") \
  && rm /tmp/LATEST_RELEASE \
  && if [ -z "$CHROME_DRIVER_VERSION" ]; \
     then CHROME_DRIVER_VERSION="${CD_VERSION}"; \
     fi \
  && CD_VERSION=$(echo $CHROME_DRIVER_VERSION) \
  && echo "Using chromedriver version: "$CD_VERSION \
  && wget --no-verbose -O /tmp/chromedriver_linux64.zip https://chromedriver.storage.googleapis.com/$CD_VERSION/chromedriver_linux64.zip \
  && rm -rf /opt/selenium/chromedriver \
  && unzip /tmp/chromedriver_linux64.zip -d /opt/selenium \
  && rm /tmp/chromedriver_linux64.zip \
  && mv /opt/selenium/chromedriver /opt/selenium/chromedriver-$CD_VERSION \
  && chmod 755 /opt/selenium/chromedriver-$CD_VERSION \
  && ln -fs /opt/selenium/chromedriver-$CD_VERSION /usr/bin/chromedriver

ENV TERM xterm
ENV SCRAPY_SETTINGS_MODULE <PROJECT_NAME>.settings
RUN mkdir -p /app
WORKDIR /app
COPY ./requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY . /app
RUN python setup.py install
Replace <PROJECT_NAME> in the ENV SCRAPY_SETTINGS_MODULE instruction with the actual name of the Scrapy project being deployed.
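For example, with the sc_custom_image project from the quick start below, the instruction becomes:

ENV SCRAPY_SETTINGS_MODULE sc_custom_image.settings

Before deploying, the image can be sanity-checked locally. This is an optional step that assumes Docker is installed and that the base image does not set an entrypoint overriding the command; the my-spider tag is an arbitrary name chosen for illustration:

# Build the image from the project root (next to the Dockerfile)
docker build -t my-spider .
# List the spiders inside the container to confirm the project installed
docker run --rm my-spider scrapy list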
Spider Code Example for Chrome
# demo.py
import scrapy
from selenium import webdriver


class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['http://quotes.toscrape.com/js']

    def __init__(self, *args, **kwargs):
        super(DemoSpider, self).__init__(*args, **kwargs)
        options = webdriver.ChromeOptions()
        options.add_argument("--disable-extensions")
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        self.driver = webdriver.Chrome(options=options,
                                       executable_path='/usr/bin/chromedriver')

    def parse(self, response):
        self.driver.get(response.url)
        for quote in self.driver.find_elements_by_css_selector('div.quote'):
            yield {
                'quote': quote.find_element_by_css_selector('span').text,
                'author': quote.find_element_by_css_selector('small').text,
            }
        next_page_url = response.css('nav li.next a ::attr(href)').extract_first()
        if next_page_url:
            yield scrapy.Request(response.urljoin(next_page_url))

    def closed(self, reason):
        # Shut down Chrome when the spider finishes so the browser
        # process does not linger in the container.
        self.driver.quit()
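One caveat with this pattern: the spider shares a single Chrome instance across all callbacks, while Scrapy schedules requests concurrently by default, so parallel callbacks can race on driver.get(). A minimal sketch of one way to serialize them; custom_settings and CONCURRENT_REQUESTS are standard Scrapy, but adding them here is a suggestion, not part of the original example:

class DemoSpider(scrapy.Spider):
    name = 'demo'
    # Hypothetical addition: handle one request at a time so concurrent
    # callbacks don't race on the shared Selenium driver.
    custom_settings = {
        'CONCURRENT_REQUESTS': 1,
    }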
Firefox
Dockerfile Example for Firefox
FROM scrapinghub/scrapinghub-stack-scrapy:1.3
RUN printf "deb http://archive.debian.org/debian/ jessie main\ndeb-src http://archive.debian.org/debian/ jessie main\ndeb http://security.debian.org jessie/updates main\ndeb-src http://security.debian.org jessie/updates main" > /etc/apt/sources.list
RUN apt-get update && apt-get install -y unzip

#============================================
# Firefox and Geckodriver
#============================================
RUN apt-get update \
  && apt-get install -y --no-install-recommends \
     ca-certificates curl firefox-esr \
  && rm -fr /var/lib/apt/lists/* \
  && curl -L https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz | tar xz -C /usr/local/bin \
  && apt-get purge -y ca-certificates curl

ENV TERM xterm
ENV SCRAPY_SETTINGS_MODULE <PROJECT_NAME>.settings
RUN mkdir -p /app
WORKDIR /app
COPY ./requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY . /app
RUN python setup.py install
Alternatively, to use Firefox with Python 3:
FROM scrapinghub/scrapinghub-stack-scrapy:1.8-py3
RUN printf "deb [trusted=yes] http://archive.debian.org/debian/ jessie main\ndeb-src [trusted=yes] http://archive.debian.org/debian/ jessie main\ndeb [trusted=yes] http://security.debian.org jessie/updates main\ndeb-src [trusted=yes] http://security.debian.org jessie/updates main" > /etc/apt/sources.list
RUN apt-get update
RUN apt-get install -y unzip procps
#============================================
# Firefox and Geckodriver
#============================================
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
ca-certificates curl firefox-esr \
&& rm -fr /var/lib/apt/lists/* \
&& curl -L https://github.com/mozilla/geckodriver/releases/download/v0.29.1/geckodriver-v0.29.1-linux64.tar.gz | tar xz -C /usr/local/bin \
&& apt-get purge -y ca-certificates curl

ENV TERM xterm
ENV SCRAPY_SETTINGS_MODULE <PROJECT_NAME>.settings
RUN mkdir -p /app
WORKDIR /app
COPY ./requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt
COPY . /app
RUN python setup.py install
Replace <PROJECT_NAME> in the ENV SCRAPY_SETTINGS_MODULE instruction with the actual name of the Scrapy project being deployed.
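As with the Chrome image, it can be worth confirming locally that the browser and driver made it into the image before deploying. This assumes Docker is installed; my-ff-spider is an arbitrary tag used for illustration:

docker build -t my-ff-spider .
# Both binaries print their versions if the install steps succeeded
docker run --rm my-ff-spider firefox-esr --version
docker run --rm my-ff-spider geckodriver --version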
Spider Code Example for Firefox
# demo.py
import scrapy
from selenium import webdriver


class DemoSpider(scrapy.Spider):
    name = 'demo'
    start_urls = ['http://quotes.toscrape.com/js']

    def __init__(self, *args, **kwargs):
        super(DemoSpider, self).__init__(*args, **kwargs)
        options = webdriver.FirefoxOptions()
        options.add_argument("--width=1920")
        options.add_argument("--height=1080")
        options.add_argument("--headless")
        self.driver = webdriver.Firefox(options=options)

    def parse(self, response):
        self.driver.get(response.url)
        for quote in self.driver.find_elements_by_css_selector('div.quote'):
            yield {
                'quote': quote.find_element_by_css_selector('span').text,
                'author': quote.find_element_by_css_selector('small').text,
            }
        next_page_url = response.css('nav li.next a ::attr(href)').extract_first()
        if next_page_url:
            yield scrapy.Request(response.urljoin(next_page_url))

    def closed(self, reason):
        # Quit Firefox when the spider finishes so the browser
        # process does not linger in the container.
        self.driver.quit()
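The find_element(s)_by_* helper methods used in both spider examples come from Selenium 3, which is what these stacks date from; Selenium 4 removed them. If your requirements.txt pulls in a Selenium 4 release, the equivalent lookups go through the By class. A sketch of just the changed lines inside parse():

from selenium.webdriver.common.by import By

# Selenium 4 replacement for find_elements_by_css_selector and friends
for quote in self.driver.find_elements(By.CSS_SELECTOR, 'div.quote'):
    yield {
        'quote': quote.find_element(By.CSS_SELECTOR, 'span').text,
        'author': quote.find_element(By.CSS_SELECTOR, 'small').text,
    }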
Quick Start Steps
- For a jump start, download https://github.com/scrapinghub/sample-projects/tree/master/sc_custom_image.
- Create a Dockerfile in the sc_custom_image root folder (where scrapy.cfg is), copy/paste the content of either Dockerfile example above, and replace <PROJECT_NAME> with sc_custom_image.
- Update scrapinghub.yml with the numerical ID of the Scrapy Cloud project that will contain the spider being deployed (see the example after this list).
- Create a setup.py file, taking this one as an example: https://github.com/scrapinghub/sample-projects/blob/master/splash_crawlera_example/setup.py, omitting the package_data declaration line (a minimal sketch also follows this list).
- Replace demo.py located at sc_custom_image/sc_custom_image/spiders with the content of either spider code example above.
- Run shub deploy.
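For reference, a minimal scrapinghub.yml for a custom-image deployment might look like the following; 12345 stands in for your numerical project ID, and image: true is what tells shub to build and deploy the Dockerfile (check the shub documentation linked above if your shub version expects a different layout):

project: 12345
image: true

And a minimal setup.py along the lines of the linked example, with the package_data line omitted; the name value is arbitrary, and the entry_points setting must point at your project's settings module:

from setuptools import setup, find_packages

setup(
    name='sc_custom_image',
    version='1.0',
    packages=find_packages(),
    entry_points={'scrapy': ['settings = sc_custom_image.settings']},
)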