python - How to give custom name to images when downloading through scrapy -


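This program downloads images through Scrapy's image pipeline. It works and downloads the images, but the problem is that it **renames the images to their SHA1 hash**, after which I am unable to identify them. Is there a solution so that I can use the **model_name** as the name of the downloaded images?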

   # Spider module: renders each product page with Selenium and yields one
   # item per page carrying the product title and its gallery image URLs.
   import time
   from urlparse import urljoin

   import scrapy
   from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
   from scrapy.contrib.spiders import CrawlSpider, Rule
   from scrapy.selector import Selector
   from selenium import webdriver


   class CompItem(scrapy.Item):
       """Item describing one product page and the images found on it."""

       model_name = scrapy.Field()
       images = scrapy.Field()
       image_urls = scrapy.Field()
       image_name = scrapy.Field()
       # Declared so that a pipeline may record the stored file paths on the
       # item; assigning an undeclared field on a scrapy.Item raises KeyError.
       image_paths = scrapy.Field()


   class CriticSpider(CrawlSpider):
       """Collect gallery image URLs from a fixed list of product pages."""

       name = "buysmaart_images"
       # allowed_domains takes bare domain names, not URLs.
       allowed_domains = ["buysmaart.com"]
       start_urls = [
           "http://buysmaart.com/productdetails/550/samsung-galaxy-note-4",
           "http://buysmaart.com/productdetails/115/htc-one-m8-eye",
           "http://buysmaart.com/productdetails/506/oppo-n1",
           "http://buysmaart.com/productdetails/342/lg-g2-d802t",
       ]

       def __init__(self, *args, **kwargs):
           """Start the Firefox driver used to render the pages."""
           super(CriticSpider, self).__init__(*args, **kwargs)
           self.download_delay = 0.25
           self.browser = webdriver.Firefox()
           self.browser.implicitly_wait(2)

       def closed(self, reason):
           """Quit the Selenium browser when the spider finishes."""
           self.browser.quit()

       def parse_start_url(self, response):
           """Yield one CompItem for the product page behind ``response``.

           The page is re-fetched in the Selenium browser and the rendered
           source is parsed instead of the raw response body. The item
           carries ``image_name`` (the first matching heading, ASCII-encoded)
           and ``image_urls`` (one absolute URL per gallery entry). Nothing
           is yielded when images exist but no title is found, since the
           images could not be named.
           """
           self.browser.get(response.url)
           # Fixed wait that gives the page's scripts time to render.
           time.sleep(8)
           sel = Selector(text=self.browser.page_source)

           # NOTE(review): XPath matching is case-sensitive; confirm the id and
           # class values below against the live page markup.
           photos = sel.xpath('//ul[contains(@id,"productimageul")]/li')
           print(len(photos))

           # The title does not depend on the photo, so look it up once.
           titles = sel.xpath(
               './/h3[contains(@class,"ng-binding")]/text()').extract()
           if photos and not titles:
               self.log("No product title found on %s; skipping" % response.url)
               return

           item = CompItem()
           if titles:
               item['image_name'] = titles[0].encode('ascii', 'ignore')

           all_photo_urls = []
           for photo in photos:
               sources = photo.xpath('.//img/@src').extract()
               if not sources:
                   # Gallery entry without an <img>; nothing to download.
                   continue
               # urljoin leaves absolute URLs untouched and resolves relative
               # ones against the page URL.
               all_photo_urls.append(urljoin(response.url, sources[0]))
           item['image_urls'] = all_photo_urls
           yield item

pipeline

    import posixpath
    import re

    from scrapy.contrib.pipeline.images import ImagesPipeline
    from scrapy.exceptions import DropItem
    from scrapy.http import Request

    # Hash-named JPEG paths such as "full/<hex>.jpg"; any leading directory
    # part is optional. Compiled once instead of on every image.
    _HASHED_JPG = re.compile(r'^(?:.*/)?[0-9a-f]+\.jpg$')


    class DownloadImagesPipeline(ImagesPipeline):
        """Images pipeline that names stored files after the item's image_name."""

        def get_media_requests(self, item, info):
            """Return one download request per URL in ``item['image_urls']``.

            Each request carries the item's ``image_name`` and the position
            of the URL in its meta, so the file can be renamed later.
            """
            name = item.get('image_name')
            return [
                Request(url, meta={'image_name': name, 'image_index': index})
                for index, url in enumerate(item.get('image_urls', []))
            ]

        def get_images(self, response, request, info):
            """Yield ``(key, image, buf)`` with hash-based keys renamed."""
            parent = super(DownloadImagesPipeline, self)
            for key, image, buf in parent.get_images(response, request, info):
                if _HASHED_JPG.match(key):
                    key = self.change_filename(key, response)
                yield key, image, buf

        def change_filename(self, key, response):
            """Return ``key`` with its file name replaced by the item name.

            The directory part of ``key`` is kept. The first image of an item
            is stored as ``<name>.jpg`` and later ones as ``<name>_<n>.jpg``
            so that several images of one item do not overwrite each other.
            ``key`` is returned unchanged when no name is available.
            """
            name = response.meta.get('image_name')
            if not name:
                return key
            if isinstance(name, bytes):
                name = name.decode('ascii', 'ignore')
            # Path separators in a title would create unintended directories.
            safe_name = re.sub(r'[\\/]+', '_', name).strip()
            index = response.meta.get('image_index', 0)
            if index:
                filename = '%s_%d.jpg' % (safe_name, index)
            else:
                filename = '%s.jpg' % safe_name
            return posixpath.join(posixpath.dirname(key), filename)

        def item_completed(self, results, item, info):
            """Store the downloaded paths on the item, or drop it if none.

            Raises:
                DropItem: when no image of the item was downloaded.
            """
            # NOTE(review): the 'path' values in ``results`` are produced by
            # the base pipeline and may still be hash-based; overriding
            # file_path() is the documented way to rename consistently.
            # Confirm for the Scrapy version in use.
            image_paths = [x['path'] for ok, x in results if ok]
            if not image_paths:
                raise DropItem("item contains no images")
            # The item class must declare an ``image_paths`` field.
            item['image_paths'] = image_paths
            return item

settings

# Scrapy project settings. Scrapy only reads upper-case setting names.
BOT_NAME = 'download_images'

SPIDER_MODULES = ['download_images.spiders']
NEWSPIDER_MODULE = 'download_images.spiders'

# Pipelines are given as a dict mapping dotted class path to order; the list
# form is deprecated.
# NOTE(review): this registers the stock ImagesPipeline. A custom subclass
# only runs if its own dotted path is listed here instead.
ITEM_PIPELINES = {'scrapy.contrib.pipeline.images.ImagesPipeline': 1}

# Directory under which downloaded images are stored.
IMAGES_STORE = '/home/john/desktop/download_images/31_jul'

Scrapy 1.3.3 solution (override the `image_downloaded` method):

import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.misc import md5sum


class MyImagesPipeline(ImagesPipeline):
    """Images pipeline that stores the full-size image under a custom name."""

    def get_media_requests(self, item, info):
        """Yield one request per image URL, passing the item's names in meta."""
        for image_url in item['image_urls']:
            yield scrapy.Request(
                image_url, meta={'image_names': item["image_names"]})

    def image_downloaded(self, response, request, info):
        """Persist every image from ``get_images`` and return a checksum.

        The checksum is the MD5 of the first image yielded. Paths under
        ``full/`` are replaced by ``full/<first entry of image_names>``;
        other paths (for example thumbnails) keep the name chosen by the
        base class so they do not overwrite the full-size file.
        """
        checksum = None
        for path, image, buf in self.get_images(response, request, info):
            if checksum is None:
                buf.seek(0)
                checksum = md5sum(buf)
            width, height = image.size
            if path.startswith('full/'):
                # Custom name instead of the hash-based one.
                # NOTE(review): no extension is appended here; presumably
                # image_names entries already include one. Confirm.
                path = 'full/%s' % response.meta['image_names'][0]
            self.store.persist_file(
                path, buf, info,
                meta={'width': width, 'height': height},
                headers={'Content-Type': 'image/jpeg'})
        return checksum

Comments

Popular posts from this blog

c - Bitwise operation with (signed) enum value -

xslt - Unnest parent nodes by child node -

python - Healpy: From Data to Healpix map -