python - UnicodeEncodeError: 'ascii' codec can't encode character u'\u2013' in position 448: ordinal not in range(128) -
I am using Selenium with Python to scrape LinkedIn data. I can parse through the various webpages and scrape the data, but the process is interrupted after the first few pages due to a Unicode error. Here's my code:
"""Scrape LinkedIn job titles with Selenium and save them as UTF-8 text."""
import io
from time import sleep

SEARCH_URL = 'https://www.linkedin.com/jobs/search?locationid=sg%3a0&f_tp=1%2c2&orig=fctd&trk=jobs_jserp_posted_one_week'
OUTPUT_PATH = 'jobtitles.csv'
# NOTE(review): the flattened original reads as "click next up to 50 times,
# then scrape again"; confirm this nesting against the author's real script.
NEXT_CLICKS_PER_PASS = 50


def collect_job_titles(driver):
    """Return the text of every job-title element found while paging.

    Args:
        driver: a Selenium WebDriver already pointed at the search page.

    Returns:
        A list with one text string per 'job-title-text' element seen.
    """
    # Imported locally so this module can be imported (and save_titles
    # tested) on a machine without selenium installed.
    from selenium.common.exceptions import WebDriverException

    titles = []
    while True:
        # Retry the current page until it can be read without an error.
        while True:
            try:
                sleep(1)
                page_titles = [
                    element.text
                    for element in driver.find_elements_by_class_name('job-title-text')
                ]
            except WebDriverException:
                sleep(5)
            else:
                titles.extend(page_titles)
                break
        try:
            for _ in range(NEXT_CLICKS_PER_PASS):
                driver.find_element_by_class_name('next-btn').click()
        except WebDriverException:
            # No usable "next" button: treat this as the last page.
            break
    return titles


def save_titles(titles, path=OUTPUT_PATH):
    """Write *titles* to *path*, one per line, encoded as UTF-8.

    The original ``.encode('utf-8').decode('utf-8')`` round trip handed a
    unicode object back to a byte-mode file, which Python 2 then encoded
    with the ASCII codec and failed on characters such as u'\\u2013'.
    Opening the file with an explicit encoding does the conversion once,
    at the I/O boundary.
    """
    with io.open(path, 'w', encoding='utf-8') as f:
        f.write(u'\n'.join(titles))


def main():
    """Open the search page in Firefox, scrape all titles and save them."""
    from selenium import webdriver

    driver = webdriver.Firefox()
    driver.get(SEARCH_URL)
    save_titles(collect_job_titles(driver))


if __name__ == '__main__':
    main()
You can use the UnicodeWriter class (from the Python docs):
"""Scrape LinkedIn job titles with Selenium and save them as an encoded CSV."""
import codecs
import csv
import io
import sys
from time import sleep

# The Python 2 csv module only handles byte strings; Python 3's only text.
_PY2 = sys.version_info[0] == 2

SEARCH_URL = 'https://www.linkedin.com/jobs/search?locationid=sg%3a0&f_tp=1%2c2&orig=fctd&trk=jobs_jserp_posted_one_week'
OUTPUT_PATH = 'jobtitles.csv'
# NOTE(review): the flattened original reads as "click next up to 50 times,
# then scrape again"; confirm this nesting against the author's real script.
NEXT_CLICKS_PER_PASS = 50


class UnicodeWriter(object):
    """CSV writer that writes rows to the byte stream *f* in a given encoding.

    Args:
        f: a writable binary stream (e.g. a file opened with ``'wb'``).
        dialect: csv dialect passed through to ``csv.writer``.
        encoding: name of the target encoding for the bytes written to *f*.
        **kwds: extra formatting options passed through to ``csv.writer``.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Rows are first formatted into an in-memory queue, then re-encoded.
        self.queue = io.BytesIO() if _PY2 else io.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Format one row (an iterable of text fields) and write it to the stream."""
        if _PY2:
            self.writer.writerow([s.encode("utf-8") for s in row])
            # Fetch the UTF-8 output from the queue and decode it to text ...
            data = self.queue.getvalue().decode("utf-8")
        else:
            self.writer.writerow(row)
            data = self.queue.getvalue()
        # ... and re-encode it into the target encoding.
        self.stream.write(self.encoder.encode(data))
        # Empty the queue. io buffers keep their position across truncate(),
        # so rewind first or the next row would be preceded by padding.
        self.queue.seek(0)
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row in *rows* using :meth:`writerow`."""
        for row in rows:
            self.writerow(row)


def collect_job_titles(driver):
    """Return the text of every job-title element found while paging.

    Args:
        driver: a Selenium WebDriver already pointed at the search page.

    Returns:
        A list with one text string per 'job-title-text' element seen.
    """
    # Imported locally so UnicodeWriter stays importable without selenium.
    from selenium.common.exceptions import WebDriverException

    titles = []
    while True:
        # Retry the current page until it can be read without an error.
        while True:
            try:
                sleep(1)
                page_titles = [
                    element.text
                    for element in driver.find_elements_by_class_name('job-title-text')
                ]
            except WebDriverException:
                sleep(5)
            else:
                titles.extend(page_titles)
                break
        try:
            for _ in range(NEXT_CLICKS_PER_PASS):
                driver.find_element_by_class_name('next-btn').click()
        except WebDriverException:
            # No usable "next" button: treat this as the last page.
            break
    return titles


def main():
    """Open the search page in Firefox, scrape all titles and write the CSV."""
    from selenium import webdriver

    driver = webdriver.Firefox()
    driver.get(SEARCH_URL)
    titles = collect_job_titles(driver)
    # Binary mode: UnicodeWriter hands already-encoded bytes to the stream.
    with open(OUTPUT_PATH, 'wb') as f:
        doc = UnicodeWriter(f)
        # Each title is a single-field row. Passing the bare strings would
        # make writerow() iterate them and emit one column per character.
        doc.writerows([title] for title in titles)


if __name__ == '__main__':
    main()
Comments
Post a Comment