""" retrieve_scrape_data 入库语句。 列顺序须与 DrugPipeline.storge_data 中构建的 row 字段一致。 """ # 与 pipelines/drug_pipelines.py 里 row 的键顺序一致(便于核对与扩展) RETRIEVE_SCRAPE_INSERT_COLUMNS = ( "platform_id", "platform_item_id", "enterprise_id", "product_name", "product_specs", "product_brand", "one_box_price", "link_url", "store_name", "store_url", "shipment_province_id", "shipment_province_name", "shipment_city_id", "shipment_city_name", "manufacturer", "company_name", "scrape_date", "is_sold_out", "min_price", "sales", "inventory", "snapshot_url", "approval_number", "expiry_date", "update_time", "insert_time", "number", "task_id", "anonymous_store_name", "search_name", "collect_config_info", "area_info", "city_name", "city_id", "province_name", "province_id", "collect_equipment_account_id", "collect_region_id", "collect_round", ) def _retrieve_scrape_insert_if_absent_sql() -> str: cols = RETRIEVE_SCRAPE_INSERT_COLUMNS column_list = ", ".join(f"`{c}`" for c in cols) inner_select = ",\n ".join(f"%s AS `{c}`" for c in cols) return f"""INSERT INTO `retrieve_scrape_data` ({column_list}) SELECT * FROM ( SELECT {inner_select} ) AS `t` WHERE NOT EXISTS ( SELECT 1 FROM `retrieve_scrape_data` AS `p` WHERE `p`.`platform_id` <=> %s AND `p`.`scrape_date` <=> %s AND `p`.`platform_item_id` <=> %s AND `p`.`collect_equipment_account_id` <=> %s AND `p`.`collect_round` <=> %s );""" sql_map = { # 仅插入:同 platform_id, scrape_date, platform_item_id, collect_equipment_account_id, collect_round 已存在则跳过 "retrieve_scrape_insert_if_absent_sql": _retrieve_scrape_insert_if_absent_sql(), }