No recent searches
Popular Articles
Sorry! nothing found for
Posted almost 4 years ago by cireyennek
I am trying to deploy my spider, but I'm getting the error ImportError: No module named html. My spider looks like:
# -*- coding: utf-8 -*-
"""Scrapy spider that collects remote job openings from a Greenhouse board.

NOTE: ``html`` is a Python 3 standard-library module, so this spider must be
run on a Python 3 interpreter.
"""
import datetime
import html
import json
import re

# Import the submodule explicitly: a bare ``import lxml`` does not guarantee
# that ``lxml.html`` has been loaded.
import lxml.html
import scrapy


class JobsSpider(scrapy.Spider):
    """Yield one item per job whose location name contains 'Remote'."""

    name = 'abarcajobs'
    allowed_domains = ['greenhouse.io']
    start_urls = ['https://api.greenhouse.io/v1/boards/abarca/jobs/']

    def parse(self, response):
        """Read the board listing and request the detail page of each remote job."""
        listings = json.loads(response.body.decode('utf-8'))
        for job in listings['jobs']:
            # Tolerate a missing or null location instead of raising.
            location_name = (job.get('location') or {}).get('name') or ''
            if 'Remote' not in location_name:
                continue
            link = (
                'https://boards-api.greenhouse.io/v1/boards/abarca/jobs/'
                + str(job['id'])
            )
            yield scrapy.Request(
                link, callback=self.parse_listing, meta={'link': link}
            )

    def parse_listing(self, response):
        """Build the scraped item from a single job's JSON detail payload."""
        details = json.loads(response.body.decode('utf-8'))

        title = details['title'].replace('/', '-')
        company_url = 'abarcahealth'
        # Slug: whitespace runs become '-', then everything except
        # letters, digits and '-' is dropped.
        title_url = re.sub(
            '[^A-Za-z0-9-]+', '', '-'.join(title.split()).lower()
        )

        # Strip non-breaking spaces only. The snippet as posted replaced the
        # ordinary space character, which removes every space in the
        # description; the variable name ``no_nbsp`` indicates the intended
        # target was the non-breaking space (entity or U+00A0).
        # NOTE(review): confirm against the original, un-rendered source.
        raw_description = html.unescape(details['content'])
        description = raw_description.replace('&nbsp;', '').replace('\xa0', '')
        teaser_text = lxml.html.fromstring(description).text_content()

        yield {
            'Company': 'Abarca Health, LLC',
            'Date': str(datetime.date.today()),
            'Link': details['absolute_url'],
            'Id': title_url + '-' + company_url,
            'Title': title,
            'TeaserText': teaser_text,
            'Description': description,
            'Date_index': datetime.datetime.today(),
            'Title_URL': title_url,
            'Company_URL': company_url,
        }
My .yml file is:
projects:
  default: 431098
# Pin a Python 3 stack. Without one the deploy runs on a Python 2 image,
# where the standard-library `html` module does not exist, which produces
# "ImportError: No module named html".
# NOTE(review): choose the stack tag that matches your Scrapy version.
stacks:
  default: scrapy:1.7-py3
requirements:
  file: requirements.txt
and my requirements.txt file is:
pymongo==3.8.0 dnspython==1.15.0 html5lib==1.0.1 json2html==1.3.0
Any suggestions on what I'm doing incorrectly?
1 Votes
0 Comments
Login to post a comment
People who like this
This post will be deleted permanently. Are you sure?
I am trying to deploy my spider, but I'm getting the error ImportError: No module named html. My spider looks like:
My .yml file is:
and my requirements.txt file is:
Any suggestions on what I'm doing incorrectly?
1 Votes
0 Comments
Login to post a comment