services.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347
  1. import os
  2. import json
  3. import re
  4. from typing import List, Dict, Any
  5. from openai import OpenAI
  6. import config.config
  7. from tools.documents_extractor import DocumentReader
  8. from application_extractor.ocr_PP_StructureV3 import LayoutParserClient_application
  9. from application_extractor.rectify_OCR_result import RectifyClient_application
  10. from transcript_extractor.rectify_transcript import RectifyClient_transcript
  11. from evidence_extractor.ocr_paddle_ocr_vl import LayoutParserClient_evidence
  12. from law_rag.run import law_rag_run
  13. from backend.embedding import compute_embedding, cosine_similarity
  14. from backend.db import fetch_similar_cases, store_case_record, init_case_db
  15. from backend.text_utils import list_files_by_ext, parse_json_from_text, extract_evidence_text_from_ocr
  16. def call_deepseek_json(system_prompt: str, user_content: str, temperature: float = 0.0) -> Any:
  17. client = OpenAI(api_key=config.config.DEEPSEEK_API, base_url="https://api.deepseek.com")
  18. response = client.chat.completions.create(
  19. model="deepseek-chat",
  20. messages=[
  21. {"role": "system", "content": system_prompt},
  22. {"role": "user", "content": user_content}
  23. ],
  24. temperature=temperature,
  25. stream=False
  26. )
  27. content = response.choices[0].message.content.strip()
  28. parsed = parse_json_from_text(content)
  29. return parsed if parsed is not None else content
  30. def extract_application_text(application_dir: str) -> str:
  31. image_exts = [".png", ".jpg", ".jpeg", ".bmp", ".gif"]
  32. doc_exts = [".pdf", ".doc", ".docx"]
  33. image_files = list_files_by_ext(application_dir, image_exts)
  34. doc_files = list_files_by_ext(application_dir, doc_exts)
  35. if image_files:
  36. ocr_client = LayoutParserClient_application()
  37. raw_text = ocr_client.parse(image_files)
  38. rectify_client = RectifyClient_application()
  39. return rectify_client.extract_legal_document(raw_text)
  40. if doc_files:
  41. reader = DocumentReader()
  42. contents = [reader.process_input(path) for path in doc_files]
  43. raw_text = "\n\n".join(contents)
  44. rectify_client = RectifyClient_application()
  45. return rectify_client.extract_legal_document(raw_text)
  46. return ""
  47. def extract_transcript_text(transcript_dir: str) -> str:
  48. doc_exts = [".pdf", ".doc", ".docx", ".png", ".jpg", ".jpeg", ".bmp", ".gif"]
  49. files = list_files_by_ext(transcript_dir, doc_exts)
  50. if not files:
  51. return ""
  52. reader = DocumentReader()
  53. contents = [reader.process_input(path) for path in files]
  54. raw_text = "\n\n".join(contents)
  55. rectify_client = RectifyClient_transcript()
  56. cleaned = rectify_client.clean_text(raw_text)
  57. return cleaned or raw_text
  58. def build_case_profile(application_text: str, transcript_text: str) -> Dict[str, Any]:
  59. system_prompt = """
  60. 你是劳动仲裁案件分析专家。请根据申请书与庭审笔录,构建案件画像。
  61. 仅输出JSON,不要包含解释或Markdown。
  62. 输出格式:
  63. {
  64. "case_profile": {
  65. "parties": "当事人信息与关系",
  66. "claims": ["仲裁请求1", "仲裁请求2"],
  67. "background": "事实与理由摘要",
  68. "timeline": ["关键时间点1", "关键时间点2"],
  69. "key_facts": ["关键事实1", "关键事实2"],
  70. "disputed_facts": ["争议事实1", "争议事实2"]
  71. }
  72. }
  73. """
  74. user_content = f"申请书:\n{application_text}\n\n庭审笔录:\n{transcript_text}"
  75. result = call_deepseek_json(system_prompt, user_content)
  76. if isinstance(result, dict) and "case_profile" in result:
  77. return result
  78. return {"case_profile": {"parties": "", "claims": [], "background": "", "timeline": [], "key_facts": [], "disputed_facts": []}}
  79. def extract_dispute_points(application_text: str, transcript_text: str) -> List[str]:
  80. system_prompt = """
  81. 你是劳动争议分析专家。请从申请书和庭审笔录中提取本案争议焦点。
  82. 如果庭审笔录中有“争议焦点”或“争议焦点为”,优先提取其内容。
  83. 仅输出JSON,不要包含解释或Markdown。
  84. 输出格式:
  85. {
  86. "dispute_points": ["争议焦点1", "争议焦点2"]
  87. }
  88. """
  89. user_content = f"申请书:\n{application_text}\n\n庭审笔录:\n{transcript_text}"
  90. result = call_deepseek_json(system_prompt, user_content)
  91. if isinstance(result, dict):
  92. points = result.get("dispute_points", [])
  93. if isinstance(points, list):
  94. return [p for p in points if isinstance(p, str) and p.strip()]
  95. text = transcript_text or application_text
  96. matches = re.findall(r"争议焦点为[::]?(.*)", text)
  97. if matches:
  98. raw = matches[0]
  99. parts = re.split(r"[;;。]\s*|\d+[、.]", raw)
  100. return [p.strip() for p in parts if p.strip()]
  101. return []
  102. def list_evidence_categories(evidence_dir: str) -> List[str]:
  103. if not evidence_dir or not os.path.exists(evidence_dir):
  104. return []
  105. entries = []
  106. for name in os.listdir(evidence_dir):
  107. full_path = os.path.join(evidence_dir, name)
  108. if os.path.isdir(full_path):
  109. entries.append(name)
  110. if entries:
  111. return sorted(entries)
  112. files = list_files_by_ext(evidence_dir, [".pdf", ".doc", ".docx", ".png", ".jpg", ".jpeg", ".bmp", ".gif"])
  113. return ["未分类"] if files else []
  114. def select_relevant_categories(dispute_points: List[str], categories: List[str]) -> Dict[str, List[str]]:
  115. if not dispute_points or not categories:
  116. return {}
  117. system_prompt = """
  118. 你是证据分析专家。请根据争议焦点选择可能相关的证据类别。
  119. 仅输出JSON,不要包含解释或Markdown。
  120. 输出格式:
  121. {
  122. "mapping": [
  123. {"dispute_point": "争议焦点1", "categories": ["证据类别A", "证据类别B"]},
  124. {"dispute_point": "争议焦点2", "categories": ["证据类别C"]}
  125. ]
  126. }
  127. """
  128. user_content = json.dumps({"dispute_points": dispute_points, "evidence_categories": categories}, ensure_ascii=False)
  129. result = call_deepseek_json(system_prompt, user_content)
  130. mapping: Dict[str, List[str]] = {}
  131. if isinstance(result, dict):
  132. items = result.get("mapping", [])
  133. if isinstance(items, list):
  134. for item in items:
  135. point = item.get("dispute_point")
  136. cats = item.get("categories", [])
  137. if isinstance(point, str) and isinstance(cats, list):
  138. valid = [c for c in cats if c in categories]
  139. if valid:
  140. mapping[point] = valid
  141. return mapping
  142. def limit_files(files: List[str], limit: int = 10) -> List[str]:
  143. if len(files) <= limit:
  144. return files
  145. head = limit // 2
  146. tail = limit - head
  147. return files[:head] + files[-tail:]
  148. def ocr_evidence_files(files: List[str]) -> List[Dict[str, Any]]:
  149. image_exts = [".png", ".jpg", ".jpeg", ".bmp", ".gif"]
  150. doc_exts = [".pdf", ".doc", ".docx"]
  151. images = [f for f in files if os.path.splitext(f)[1].lower() in image_exts]
  152. docs = [f for f in files if os.path.splitext(f)[1].lower() in doc_exts]
  153. results = []
  154. if images:
  155. ocr_client = LayoutParserClient_evidence()
  156. text = ocr_client.parse(images)
  157. extracted = extract_evidence_text_from_ocr(text)
  158. results.append(
  159. {
  160. "files": images,
  161. "text": extracted["text"][:2000],
  162. "lines": extracted["lines"][:200]
  163. }
  164. )
  165. if docs:
  166. reader = DocumentReader()
  167. for doc in docs:
  168. text = reader.process_input(doc)
  169. results.append({"files": [doc], "text": text[:2000]})
  170. return results
  171. def retrieve_laws(dispute_points: List[str]) -> Dict[str, List[Dict[str, str]]]:
  172. laws = {}
  173. for point in dispute_points:
  174. laws[point] = law_rag_run(point)
  175. return laws
  176. def final_judgement(
  177. case_profile: Dict[str, Any],
  178. dispute_points: List[str],
  179. law_results: Dict[str, Any],
  180. evidence_results: Dict[str, Any],
  181. similar_cases: List[Dict[str, Any]]
  182. ) -> Dict[str, Any]:
  183. system_prompt = """
  184. 你是劳动争议案件裁决分析专家。基于案件画像、争议焦点、证据摘要、相关法律条文与相似案例,给出最终判断。
  185. 仅输出JSON,不要包含解释或Markdown。
  186. 输出格式:
  187. {
  188. "final_decision": "最终判断结论",
  189. "reasoning": "综合理由",
  190. "dispute_point_findings": [
  191. {
  192. "dispute_point": "争议焦点1",
  193. "finding": "对此争议的判断",
  194. "evidence_used": ["证据类别A", "证据类别B"],
  195. "law_applied": ["法律条文ID1", "法律条文ID2"]
  196. }
  197. ]
  198. }
  199. """
  200. user_content = json.dumps(
  201. {
  202. "case_profile": case_profile,
  203. "dispute_points": dispute_points,
  204. "law_results": law_results,
  205. "evidence_results": evidence_results,
  206. "similar_cases": similar_cases
  207. },
  208. ensure_ascii=False
  209. )
  210. result = call_deepseek_json(system_prompt, user_content)
  211. if isinstance(result, dict):
  212. return result
  213. return {"final_decision": "", "reasoning": "", "dispute_point_findings": []}
  214. def build_case_summary_text(case_profile: Dict[str, Any], dispute_points: List[str]) -> str:
  215. profile = case_profile.get("case_profile", {}) if isinstance(case_profile, dict) else {}
  216. parts = []
  217. parties = profile.get("parties")
  218. if parties:
  219. parts.append(str(parties))
  220. claims = profile.get("claims", [])
  221. if isinstance(claims, list) and claims:
  222. parts.append(" ".join([str(c) for c in claims]))
  223. background = profile.get("background")
  224. if background:
  225. parts.append(str(background))
  226. if dispute_points:
  227. parts.append(" ".join(dispute_points))
  228. return "\n".join([p for p in parts if p])
  229. def compute_similar_cases(embedding: List[float]) -> List[Dict[str, Any]]:
  230. raw_cases = fetch_similar_cases(embedding, top_k=50)
  231. scored = []
  232. for item in raw_cases:
  233. score = cosine_similarity(embedding, item.get("embedding", []))
  234. if score <= 0:
  235. continue
  236. try:
  237. final_judgement = json.loads(item.get("final_judgement_json") or "{}")
  238. except Exception:
  239. final_judgement = {}
  240. scored.append(
  241. {
  242. "case_id": item.get("case_id"),
  243. "summary_text": item.get("summary_text"),
  244. "final_judgement": final_judgement,
  245. "similarity": score
  246. }
  247. )
  248. scored.sort(key=lambda x: x["similarity"], reverse=True)
  249. return scored[:3]
  250. def process_case_text_with_evidence(case_id: str, application_text: str, transcript_text: str, evidence_dir: str) -> Dict[str, Any]:
  251. case_profile = build_case_profile(application_text, transcript_text)
  252. dispute_points = extract_dispute_points(application_text, transcript_text)
  253. categories = list_evidence_categories(evidence_dir)
  254. mapping = select_relevant_categories(dispute_points, categories)
  255. selected = set()
  256. for cats in mapping.values():
  257. for cat in cats:
  258. selected.add(cat)
  259. if not selected and categories:
  260. if "证据清单" in categories:
  261. selected.add("证据清单")
  262. else:
  263. selected.update(categories)
  264. evidence_results = {}
  265. for category in sorted(selected):
  266. category_path = evidence_dir if category == "未分类" else os.path.join(evidence_dir, category)
  267. files = list_files_by_ext(category_path, [".pdf", ".doc", ".docx", ".png", ".jpg", ".jpeg", ".bmp", ".gif"])
  268. files = limit_files(files, 10)
  269. evidence_results[category] = ocr_evidence_files(files) if files else []
  270. law_results = retrieve_laws(dispute_points)
  271. summary_text = build_case_summary_text(case_profile, dispute_points)
  272. embedding = compute_embedding(summary_text)
  273. similar_cases = compute_similar_cases(embedding)
  274. judgement = final_judgement(case_profile, dispute_points, law_results, evidence_results, similar_cases)
  275. return {
  276. "case_profile": case_profile,
  277. "dispute_points": dispute_points,
  278. "evidence_results": evidence_results,
  279. "law_results": law_results,
  280. "summary_text": summary_text,
  281. "embedding": embedding,
  282. "similar_cases": similar_cases,
  283. "final_judgement": judgement
  284. }
  285. def process_case_dir(case_dir: str) -> Dict[str, Any]:
  286. init_case_db()
  287. case_id = os.path.basename(case_dir.rstrip(os.sep))
  288. application_dir = os.path.join(case_dir, "申请书")
  289. transcript_dir = os.path.join(case_dir, "庭审笔录")
  290. evidence_dir = os.path.join(case_dir, "证据")
  291. application_text = extract_application_text(application_dir)
  292. transcript_text = extract_transcript_text(transcript_dir)
  293. result = process_case_text_with_evidence(case_id, application_text, transcript_text, evidence_dir)
  294. store_case_record(
  295. case_id,
  296. result["summary_text"],
  297. result["case_profile"],
  298. result["dispute_points"],
  299. result["law_results"],
  300. result["evidence_results"],
  301. result["final_judgement"],
  302. result["embedding"]
  303. )
  304. return {
  305. "case_dir": case_dir,
  306. "application_text": application_text,
  307. "transcript_text": transcript_text,
  308. "case_profile": result["case_profile"],
  309. "dispute_points": result["dispute_points"],
  310. "law_results": result["law_results"],
  311. "evidence_results": result["evidence_results"],
  312. "final_judgement": result["final_judgement"],
  313. "similar_cases": result["similar_cases"]
  314. }