关键词

scrapy在重复爬取的时候删除掉之前爬的旧数据,在爬虫结束的时候收集统计信息

问题:想在启动scrapy后重复爬取某一天的数据,但是爬取之前需要删除掉之前的旧数据,在哪里实现删除呢?

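可以在 pipeline 的 open_spider(self, spider) 方法中执行删除:该方法只在爬虫启动时被调用一次,因此旧数据会在开始爬取之前被清除。统计信息则可以在爬虫结束时调用的 close_spider(self, spider) 中,通过 spider.crawler.stats 获取。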

以下是pipelines.py 文件

 

# -*- coding: utf-8 -*-
import sys
sys.path.append("/apps/jr_python/riskspiders")
from riskspiders.utils import DButil
from riskspiders.settings import DATABASE_PRM
import logging
import hashlib
logger = logging.getLogger(__name__)
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class RiskspidersPipeline(object):
    """Pass-through item pipeline that holds a shared database handle."""

    # The handle is created while the class body is evaluated, so it exists
    # as soon as the class is defined and is shared as a class attribute.
    db = DButil(DATABASE_PRM)

    def process_item(self, item, spider):
        """Return ``item`` unchanged; ``spider`` is not used."""
        return item

class RiskspidersMySQLPipeline(object):
    """Item pipeline that writes scraped items to MySQL.

    When the spider opens, rows in ``riskinfo`` previously stored for the
    spider and for each day it is about to crawl are deleted so that a
    re-crawl does not leave stale data behind.  Every item is then upserted
    into ``hexun_bankdata`` (for the ``hexun_bankdata`` spider) or into
    ``riskinfo`` (for every other spider).
    """

    def open_spider(self, spider):
        """Open the database handle and delete the old rows for this crawl.

        Spiders that do not define ``day_list`` (or define it empty) simply
        skip the clean-up.  A failing delete is logged and the remaining
        days are still processed.
        """
        print("open_spider, %s" % spider.name)
        self.db = DButil(DATABASE_PRM)
        # Parameterized so that quotes in the values cannot break or alter
        # the statement.
        sql_del = "delete from riskinfo where spider = %s and release_time = %s;"
        for day in getattr(spider, 'day_list', None) or ():
            try:
                self.db.execute(sql_del, (spider.name, day))
            except Exception:
                logger.exception(
                    'failed to delete old riskinfo rows for spider %s, day %s',
                    spider.name, day)

    def close_spider(self, spider):
        """Close the database handle and print the crawl statistics."""
        self.db.close()
        # Most statistics are available here; values such as finish_time are
        # not, because the crawl has not completely finished yet.
        print(spider.crawler.stats.get_stats())

    def process_item(self, item, spider):
        """Upsert ``item`` into MySQL and return it.

        The statement and its parameters are built before the connection is
        opened, so an item with a missing field raises ``KeyError`` without
        leaving a connection open.  Database errors are logged and
        swallowed; the item is returned in every case so that later
        pipelines still receive it.
        """
        if spider.name == 'hexun_bankdata':
            sql, params = self._bank_statement(item)
        else:
            sql, params = self._risk_statement(item)
        logger.info('***** item_bank insert MySQL')

        # NOTE(review): a separate connection is opened for every item, as in
        # the original code; confirm whether the handle created in
        # open_spider could be reused instead.
        db = DButil(DATABASE_PRM)
        try:
            db.execute(sql, params)
        except Exception:
            logger.exception('insert into MySQL failed for spider %s',
                             spider.name)
        finally:
            db.close()
        return item

    @staticmethod
    def _content_id(title, content):
        """Return the hex MD5 digest of ``title`` followed by ``content``."""
        text = '%s%s' % (title, content)
        # hashlib only accepts bytes; encode text explicitly so non-ASCII
        # content works.
        if not isinstance(text, bytes):
            text = text.encode('utf-8')
        return hashlib.md5(text).hexdigest()

    @staticmethod
    def _bank_statement(item):
        """Build the ``hexun_bankdata`` upsert as ``(sql, params)``."""
        params = (
            item["source"], item["spider"], item['website_menu'],
            item["disclosure_period"], item["bank_abbreviation"],
            item["total_assets"], item["capital_adequancy_ratio"],
            item["core_capital_adequancy_ratio"], item["bad_loan_ratio"],
            item["provision_coverage"], item["url"], item["cra_time"],
            # Second cra_time feeds the "on duplicate key update" clause.
            item["cra_time"],
        )
        sql = """insert into hexun_bankdata(source,spider,website_menu,disclosure_period, bank_abbreviation,total_assets,capital_adequancy_ratio,core_capital_adequancy_ratio,bad_loan_ratio,provision_coverage,url,cra_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)  on duplicate key update  cra_time = %s;"""
        return sql, params

    @staticmethod
    def _risk_statement(item):
        """Build the ``riskinfo`` upsert as ``(sql, params)``.

        ``content_id`` is the MD5 of the unstripped title and content.
        """
        content_id = RiskspidersMySQLPipeline._content_id(
            item['title'], item['content'])
        params = (
            item['source'], item['spider'], item['website_menu'],
            item['release_time'], item['key_words'], item['neg_key_words'],
            item['title'].strip(), item['source_type'], item['f_name'],
            item['is_include_tbl'], item['content'].strip(),
            item['content_web'], item['url'], item['father_url'],
            item['cra_time'], content_id,
            # Second cra_time feeds the "on duplicate key update" clause.
            item['cra_time'],
        )
        sql = """
            insert into riskinfo
            (source, spider,website_menu, release_time, key_words,neg_key_words, title, source_type,f_name, is_include_tbl,content,content_web, url,father_url,cra_time,content_id)
            values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) on duplicate key update  cra_time = %s;
            """
        return sql, params

 

本文链接:http://task.lmcjl.com/news/6954.html

展开阅读全文