Python - How to give custom names to images when downloading through Scrapy
This program downloads images through the image pipeline. It works and downloads the images, but the problem is that it **renames the images to their SHA1 hash**, after which I am unable to identify them. Is there a solution so that I can use the **model_name** as the name of the downloaded images?
import time
from urlparse import urljoin

import scrapy
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from selenium import webdriver


class compitem(scrapy.Item):
    """Item describing one product page and the images found on it."""

    model_name = scrapy.Field()
    images = scrapy.Field()
    image_urls = scrapy.Field()
    image_name = scrapy.Field()
    # Written by the pipeline's item_completed(); an undeclared key would
    # raise KeyError on a scrapy.Item.
    image_paths = scrapy.Field()


class criticspider(CrawlSpider):
    """Collect product image URLs from buysmaart.com product pages.

    Each start URL is rendered in a Selenium-driven Firefox instance, and
    the rendered page source is what gets parsed for image URLs.
    """

    name = "buysmaart_images"
    # allowed_domains expects bare domain names, not URLs.
    allowed_domains = ["buysmaart.com"]
    start_urls = [
        "http://buysmaart.com/productdetails/550/samsung-galaxy-note-4",
        "http://buysmaart.com/productdetails/115/htc-one-m8-eye",
        "http://buysmaart.com/productdetails/506/oppo-n1",
        "http://buysmaart.com/productdetails/342/lg-g2-d802t",
    ]

    def __init__(self, *args, **kwargs):
        """Set the download delay and start the browser used for rendering."""
        super(criticspider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Firefox()
        self.browser.implicitly_wait(2)

    def closed(self, reason):
        """Quit the browser when the spider closes so Firefox is not leaked."""
        self.browser.quit()

    def parse_start_url(self, response):
        """Yield one ``compitem`` with the page's image URLs and image name.

        The page is re-loaded in the browser and parsed from the rendered
        source rather than from ``response`` itself.
        """
        self.browser.get(response.url)
        # Fixed wait before reading page_source.
        # NOTE(review): presumably this gives client-side rendering time to
        # finish; it also blocks the crawler for the whole duration.
        time.sleep(8)
        sel = Selector(text=self.browser.page_source)

        photos = sel.xpath('//ul[contains(@id,"productimageul")]/li')
        print(len(photos))

        item = compitem()

        # The title lookup does not depend on the individual photo, so it is
        # done once instead of once per <li>.
        titles = sel.xpath(
            './/h3[contains(@class,"ng-binding")]/text()').extract()
        if titles:
            item['image_name'] = titles[0].encode('ascii', 'ignore')

        all_photo_urls = []
        for photo in photos:
            sources = photo.xpath('.//img/@src').extract()
            if not sources:
                # <li> without an <img src>: nothing to download.
                continue
            # Resolve relative src values; image requests need absolute URLs.
            all_photo_urls.append(urljoin(response.url, sources[0]))

        item['image_urls'] = all_photo_urls
        yield item
pipeline
import re

# On Scrapy 1.x the same class is imported from scrapy.pipelines.images.
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

# Hash-named key, optionally preceded by a directory ("full/<hex>.jpg").
_HASHED_KEY_RE = re.compile(r'^(?:.*/)?[0-9a-f]+\.jpg$')


class downloadimagespipeline(ImagesPipeline):
    """Images pipeline that stores files under the item's ``image_name``.

    ``process_item`` is deliberately not overridden: the inherited
    implementation is what drives the download machinery.
    """

    def get_media_requests(self, item, info):
        """Return one download request per URL in ``item['image_urls']``.

        The item's ``image_name`` and the position of the URL travel in the
        request meta so the stored file can be named after them.
        """
        image_name = item.get('image_name')
        return [
            Request(url, meta={'image_name': image_name, 'image_index': index})
            for index, url in enumerate(item.get('image_urls', []))
        ]

    def get_images(self, response, request, info):
        """Yield ``(key, image, buf)`` with hash-based keys renamed."""
        parent = super(downloadimagespipeline, self)
        for key, image, buf in parent.get_images(response, request, info):
            if _HASHED_KEY_RE.match(key):
                key = self.change_filename(key, response)
            yield key, image, buf

    def change_filename(self, key, response):
        """Return ``key`` with its file name replaced by the image name.

        The directory part of ``key`` is kept. When the response carries no
        usable ``image_name`` the key is returned unchanged.
        """
        image_name = response.meta.get('image_name')
        if isinstance(image_name, (list, tuple)):
            # Accept a list of names as well; the first one is used.
            image_name = image_name[0] if image_name else None
        if not image_name:
            return key

        # Images after the first get a numeric suffix so that several images
        # of the same item do not overwrite each other.
        index = response.meta.get('image_index') or 0
        suffix = "_%d" % index if index else ""
        filename = "%s%s.jpg" % (image_name, suffix)

        directory = key.rpartition('/')[0]
        return "%s/%s" % (directory, filename) if directory else filename

    def item_completed(self, results, item, info):
        """Record the paths of downloaded images on the item.

        Raises:
            DropItem: if none of the downloads succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("item contains no images")
        # NOTE(review): 'path' in results is computed by the base pipeline and
        # may still be the hash-based path rather than the renamed key;
        # confirm against the Scrapy version in use.
        # Only write the field when the item can hold it.
        if isinstance(item, dict) or 'image_paths' in getattr(item, 'fields', ()):
            item['image_paths'] = image_paths
        return item
settings
# Scrapy project settings. Setting names must be upper-case; lower-case
# names in a settings module are not loaded as settings.
BOT_NAME = 'download_images'

SPIDER_MODULES = ['download_images.spiders']
NEWSPIDER_MODULE = 'download_images.spiders'

# Pipelines are declared as {dotted class path: order}.
# NOTE(review): this registers the stock ImagesPipeline, which names files by
# hash. To get custom file names, register the project's own ImagesPipeline
# subclass here instead (its dotted path depends on the project layout).
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}

# Directory the images pipeline stores downloaded files in.
IMAGES_STORE = '/home/john/desktop/download_images/31_jul'
Scrapy 1.3.3 solution (override the `image_downloaded` method):
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum


class myimagespipeline(ImagesPipeline):
    """Images pipeline that stores the full-size image under a custom name."""

    def get_media_requests(self, item, info):
        """Yield one request per image URL, carrying the item's image names."""
        for image_url in item['image_urls']:
            yield scrapy.Request(
                image_url, meta={'image_names': item["image_names"]})

    def image_downloaded(self, response, request, info):
        """Persist the downloaded image(s) and return the first one's checksum.

        Mirrors the stock implementation except for the storage path of the
        full-size image, which is taken from the request meta.
        """
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            # Rename only the full-size image; thumbnails keep the path they
            # were given so they do not overwrite the full-size file.
            if path.startswith('full/'):
                # assumes meta['image_names'] is a sequence of file names and
                # the first one applies to this image -- TODO confirm
                path = 'full/%s' % response.meta['image_names'][0]
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'content-type': 'image/jpeg'})
        return checksum
Comments
Post a Comment