# PP-StructureV3.py (2.3 KB)
  1. # Please make sure the requests library is installed
  2. # pip install requests
  3. import base64
  4. import os
  5. import requests
  6. API_URL = "https://q2z8becfm967o4y7.aistudio-app.com/layout-parsing"
  7. TOKEN = "16455708d55afac2f074f4ae5a88fc6c7bae7920"
  8. file_path = "E:\\project\\arbitration_system\\evidence_extractor\\test\\F86-ZC1-2023-0001\\考勤表\\F86-ZC1-2023-0001-010_00.png"
  9. with open(file_path, "rb") as file:
  10. file_bytes = file.read()
  11. file_data = base64.b64encode(file_bytes).decode("ascii")
  12. headers = {
  13. "Authorization": f"token {TOKEN}",
  14. "Content-Type": "application/json"
  15. }
  16. required_payload = {
  17. "file": file_data,
  18. "fileType": 1, # For PDF documents, set `fileType` to 0; for images, set `fileType` to 1
  19. }
  20. optional_payload = {
  21. "useDocOrientationClassify": False,
  22. "useDocUnwarping": False,
  23. "useTextlineOrientation": False,
  24. "useChartRecognition": False,
  25. }
  26. payload = {**required_payload, **optional_payload}
  27. response = requests.post(API_URL, json=payload, headers=headers)
  28. print(response.status_code)
  29. assert response.status_code == 200
  30. result = response.json()["result"]
  31. print(result["layoutParsingResults"])
  32. output_dir = "output"
  33. os.makedirs(output_dir, exist_ok=True)
  34. for i, res in enumerate(result["layoutParsingResults"]):
  35. md_filename = os.path.join(output_dir, f"doc_{i}.md")
  36. with open(md_filename, "w", encoding="utf-8") as md_file:
  37. md_file.write(res["markdown"]["text"])
  38. print(f"Markdown document saved at {md_filename}")
  39. for img_path, img in res["markdown"]["images"].items():
  40. full_img_path = os.path.join(output_dir, img_path)
  41. os.makedirs(os.path.dirname(full_img_path), exist_ok=True)
  42. img_bytes = requests.get(img).content
  43. with open(full_img_path, "wb") as img_file:
  44. img_file.write(img_bytes)
  45. print(f"Image saved to: {full_img_path}")
  46. for img_name, img in res["outputImages"].items():
  47. img_response = requests.get(img)
  48. if img_response.status_code == 200:
  49. # Save image to local
  50. filename = os.path.join(output_dir, f"{img_name}_{i}.jpg")
  51. with open(filename, "wb") as f:
  52. f.write(img_response.content)
  53. print(f"Image saved to: {filename}")
  54. else:
  55. print(f"Failed to download image, status code: {img_response.status_code}")