sql_data.py 2.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. """
  2. retrieve_scrape_data 入库语句。
  3. 列顺序须与 DrugPipeline.storge_data 中构建的 row 字段一致。
  4. """
  5. # 与 pipelines/drug_pipelines.py 里 row 的键顺序一致(便于核对与扩展)
  6. RETRIEVE_SCRAPE_INSERT_COLUMNS = (
  7. "platform_id",
  8. "platform_item_id",
  9. "enterprise_id",
  10. "product_name",
  11. "product_specs",
  12. "product_brand",
  13. "one_box_price",
  14. "link_url",
  15. "store_name",
  16. "store_url",
  17. "shipment_province_id",
  18. "shipment_province_name",
  19. "shipment_city_id",
  20. "shipment_city_name",
  21. "manufacturer",
  22. "company_name",
  23. "scrape_date",
  24. "is_sold_out",
  25. "min_price",
  26. "sales",
  27. "inventory",
  28. "snapshot_url",
  29. "approval_number",
  30. "expiry_date",
  31. "update_time",
  32. "insert_time",
  33. "number",
  34. "task_id",
  35. "anonymous_store_name",
  36. "search_name",
  37. "collect_config_info",
  38. "area_info",
  39. "city_name",
  40. "city_id",
  41. "province_name",
  42. "province_id",
  43. "collect_equipment_account_id",
  44. "collect_region_id",
  45. "collect_round",
  46. )
  47. def _retrieve_scrape_insert_if_absent_sql() -> str:
  48. cols = RETRIEVE_SCRAPE_INSERT_COLUMNS
  49. column_list = ", ".join(f"`{c}`" for c in cols)
  50. inner_select = ",\n ".join(f"%s AS `{c}`" for c in cols)
  51. return f"""INSERT INTO `retrieve_scrape_data` ({column_list})
  52. SELECT * FROM (
  53. SELECT
  54. {inner_select}
  55. ) AS `t`
  56. WHERE NOT EXISTS (
  57. SELECT 1 FROM `retrieve_scrape_data` AS `p`
  58. WHERE `p`.`platform_id` <=> %s
  59. AND `p`.`scrape_date` <=> %s
  60. AND `p`.`platform_item_id` <=> %s
  61. AND `p`.`collect_equipment_account_id` <=> %s
  62. AND `p`.`collect_round` <=> %s
  63. );"""
  64. sql_map = {
  65. # 仅插入:同 platform_id, scrape_date, platform_item_id, collect_equipment_account_id, collect_round 已存在则跳过
  66. "retrieve_scrape_insert_if_absent_sql": _retrieve_scrape_insert_if_absent_sql(),
  67. }