Start a new topic

ImportError: No module named html

I am trying to deploy my spider, but the deploy fails with the error "ImportError: No module named html". My spider looks like this:

 

# -*- coding: utf-8 -*-
import scrapy
import json
from datetime import datetime
import datetime
import lxml
import re
import html

class JobsSpider(scrapy.Spider):
    """Scrape remote job openings from the Abarca Greenhouse job board.

    ``parse`` reads the board's job list and requests the detail endpoint of
    every job whose location name contains ``'Remote'``; ``parse_listing``
    turns each detail response into a single item dict.
    """

    name = 'abarcajobs'
    allowed_domains = ['greenhouse.io']
    start_urls = ['https://api.greenhouse.io/v1/boards/abarca/jobs/']

    def parse(self, response):
        """Yield one detail request per job whose location mentions 'Remote'.

        The response body is decoded as UTF-8 JSON and is expected to hold a
        ``jobs`` list whose entries carry ``id`` and ``location.name``.
        """
        listings = json.loads(response.body.decode('utf-8'))
        openings = [
            job['id']
            for job in listings['jobs']
            if 'Remote' in job['location']['name']
        ]

        for job_id in openings:
            link = 'https://boards-api.greenhouse.io/v1/boards/abarca/jobs/' + str(job_id)

            yield scrapy.Request(link,
                                 callback=self.parse_listing,
                                 meta={'link': link})

    def parse_listing(self, response):
        """Build the item dict for one job from its detail JSON.

        Reads ``absolute_url``, ``title`` and ``content`` from the decoded
        response and yields a dict with the keys ``Company``, ``Date``,
        ``Link``, ``Id``, ``Title``, ``TeaserText``, ``Description``,
        ``Date_index``, ``Title_URL`` and ``Company_URL``.
        """
        # A bare ``import lxml`` does not load the ``lxml.html`` submodule, so
        # import it explicitly instead of relying on another package having
        # done so already.
        import lxml.html

        details = json.loads(response.body.decode('utf-8'))
        link = details['absolute_url']

        title = details['title'].replace('/', '-')
        company = 'Abarca Health, LLC'
        company_url = 'abarcahealth'

        # Slug: whitespace runs become '-', lower-cased, then everything
        # outside [A-Za-z0-9-] is dropped.
        title_url = re.sub('[^A-Za-z0-9-]+', '', '-'.join(title.split()).lower())
        url = title_url + '-' + company_url

        day_scraped = str(datetime.date.today())
        date_index = datetime.datetime.today()

        # html.unescape turns "&nbsp;" entities into U+00A0; strip exactly that
        # character (written as an escape so it cannot be confused with, or
        # silently converted to, an ordinary space).
        description = html.unescape(details['content']).replace('\xa0', '')
        teaser_text = lxml.html.fromstring(description).text_content()

        yield {
            'Company': company,
            'Date': day_scraped,
            'Link': link,
            'Id': url,
            'Title': title,
            'TeaserText': teaser_text,
            'Description': description,
            'Date_index': date_index,
            'Title_URL': title_url,
            'Company_URL': company_url,
        }

 

My .yml file is:

 

projects:
  default: 431098
requirements:
  file: requirements.txt

My requirements.txt file is:


pymongo==3.8.0
dnspython==1.15.0
html5lib==1.0.1
json2html==1.3.0

 Any suggestions on what I'm doing incorrectly?


1 person has this question
Login to post a comment