| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970 |
- """
- retrieve_scrape_data 入库语句。
- 列顺序须与 DrugPipeline.storge_data 中构建的 row 字段一致。
- """
- # 与 pipelines/drug_pipelines.py 里 row 的键顺序一致(便于核对与扩展)
- RETRIEVE_SCRAPE_INSERT_COLUMNS = (
- "platform_id",
- "platform_item_id",
- "enterprise_id",
- "product_name",
- "product_specs",
- "product_brand",
- "one_box_price",
- "link_url",
- "store_name",
- "store_url",
- "shipment_province_id",
- "shipment_province_name",
- "shipment_city_id",
- "shipment_city_name",
- "manufacturer",
- "company_name",
- "scrape_date",
- "is_sold_out",
- "min_price",
- "sales",
- "inventory",
- "snapshot_url",
- "approval_number",
- "expiry_date",
- "update_time",
- "insert_time",
- "number",
- "task_id",
- "anonymous_store_name",
- "search_name",
- "collect_config_info",
- "area_info",
- "city_name",
- "city_id",
- "province_name",
- "province_id",
- "collect_equipment_account_id",
- "collect_region_id",
- "collect_round",
- )
- def _retrieve_scrape_insert_if_absent_sql() -> str:
- cols = RETRIEVE_SCRAPE_INSERT_COLUMNS
- column_list = ", ".join(f"`{c}`" for c in cols)
- inner_select = ",\n ".join(f"%s AS `{c}`" for c in cols)
- return f"""INSERT INTO `retrieve_scrape_data` ({column_list})
- SELECT * FROM (
- SELECT
- {inner_select}
- ) AS `t`
- WHERE NOT EXISTS (
- SELECT 1 FROM `retrieve_scrape_data` AS `p`
- WHERE `p`.`platform_id` <=> %s
- AND `p`.`scrape_date` <=> %s
- AND `p`.`platform_item_id` <=> %s
- AND `p`.`collect_equipment_account_id` <=> %s
- AND `p`.`collect_round` <=> %s
- );"""
- sql_map = {
- # 仅插入:同 platform_id, scrape_date, platform_item_id, collect_equipment_account_id, collect_round 已存在则跳过
- "retrieve_scrape_insert_if_absent_sql": _retrieve_scrape_insert_if_absent_sql(),
- }
|