"""End-to-end analysis pipeline for a labour-arbitration case directory.

The pipeline extracts text from the application and the hearing transcript,
asks an LLM for a case profile and the dispute points, OCRs the evidence
categories the LLM considers relevant, retrieves related law articles and
similar stored cases, and finally asks the LLM for a judgement.
"""
import json
import os
import re
from typing import Any, Dict, List

from openai import OpenAI

import config.config
from tools.documents_extractor import DocumentReader
from application_extractor.ocr_PP_StructureV3 import LayoutParserClient_application
from application_extractor.rectify_OCR_result import RectifyClient_application
from transcript_extractor.rectify_transcript import RectifyClient_transcript
from evidence_extractor.ocr_paddle_ocr_vl import LayoutParserClient_evidence
from law_rag.run import law_rag_run
from backend.embedding import compute_embedding, cosine_similarity
from backend.db import fetch_similar_cases, store_case_record, init_case_db
from backend.text_utils import list_files_by_ext, parse_json_from_text, extract_evidence_text_from_ocr

# File extensions handled by the OCR client vs. the document reader.  The
# combined tuple keeps the order the individual call sites used before
# (documents first, then images).
_IMAGE_EXTS = (".png", ".jpg", ".jpeg", ".bmp", ".gif")
_DOC_EXTS = (".pdf", ".doc", ".docx")
_ALL_EXTS = _DOC_EXTS + _IMAGE_EXTS

# Pseudo-category used when evidence files sit directly in the evidence dir.
_UNCATEGORIZED = "未分类"
# Category preferred as a fallback when the LLM selects nothing.
_EVIDENCE_LIST_CATEGORY = "证据清单"

# Truncation limits applied to OCR / document text before it is sent to the LLM.
_MAX_EVIDENCE_CHARS = 2000
_MAX_EVIDENCE_LINES = 200
_MAX_FILES_PER_CATEGORY = 10

_DEEPSEEK_BASE_URL = "https://api.deepseek.com"
_DEEPSEEK_MODEL = "deepseek-chat"

# Fallback patterns for pulling dispute points straight out of the transcript.
# Both the ASCII and the full-width colon/semicolon (U+FF1A / U+FF1B) are
# accepted, written as escapes so the pattern survives text normalisation.
_DISPUTE_HEADING_RE = re.compile(r"争议焦点为[:\uff1a]?(.*)")
_DISPUTE_SPLIT_RE = re.compile(r"[;\uff1b。]\s*|\d+[、.]")

# NOTE(review): the wording of the prompts below is unchanged; their line
# breaks and indentation were restored for readability.
_CASE_PROFILE_PROMPT = """
你是劳动仲裁案件分析专家。请根据申请书与庭审笔录,构建案件画像。
仅输出JSON,不要包含解释或Markdown。
输出格式:
{
  "case_profile": {
    "parties": "当事人信息与关系",
    "claims": ["仲裁请求1", "仲裁请求2"],
    "background": "事实与理由摘要",
    "timeline": ["关键时间点1", "关键时间点2"],
    "key_facts": ["关键事实1", "关键事实2"],
    "disputed_facts": ["争议事实1", "争议事实2"]
  }
}
"""

_DISPUTE_POINTS_PROMPT = """
你是劳动争议分析专家。请从申请书和庭审笔录中提取本案争议焦点。
如果庭审笔录中有“争议焦点”或“争议焦点为”,优先提取其内容。
仅输出JSON,不要包含解释或Markdown。
输出格式:
{
  "dispute_points": ["争议焦点1", "争议焦点2"]
}
"""

_CATEGORY_MAPPING_PROMPT = """
你是证据分析专家。请根据争议焦点选择可能相关的证据类别。
仅输出JSON,不要包含解释或Markdown。
输出格式:
{
  "mapping": [
    {"dispute_point": "争议焦点1", "categories": ["证据类别A", "证据类别B"]},
    {"dispute_point": "争议焦点2", "categories": ["证据类别C"]}
  ]
}
"""

_FINAL_JUDGEMENT_PROMPT = """
你是劳动争议案件裁决分析专家。基于案件画像、争议焦点、证据摘要、相关法律条文与相似案例,给出最终判断。
仅输出JSON,不要包含解释或Markdown。
输出格式:
{
  "final_decision": "最终判断结论",
  "reasoning": "综合理由",
  "dispute_point_findings": [
    {
      "dispute_point": "争议焦点1",
      "finding": "对此争议的判断",
      "evidence_used": ["证据类别A", "证据类别B"],
      "law_applied": ["法律条文ID1", "法律条文ID2"]
    }
  ]
}
"""


def _empty_case_profile() -> Dict[str, Any]:
    """Return a fresh, empty case-profile payload used when the LLM output is unusable."""
    return {
        "case_profile": {
            "parties": "",
            "claims": [],
            "background": "",
            "timeline": [],
            "key_facts": [],
            "disputed_facts": [],
        }
    }


def call_deepseek_json(system_prompt: str, user_content: str, temperature: float = 0.0) -> Any:
    """Send one system+user exchange to DeepSeek and try to parse the reply as JSON.

    Args:
        system_prompt: Instruction text sent with the ``system`` role.
        user_content: Payload sent with the ``user`` role.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        Whatever ``parse_json_from_text`` produced from the reply, or the raw
        (stripped) reply text when that helper returns ``None``.
    """
    client = OpenAI(api_key=config.config.DEEPSEEK_API, base_url=_DEEPSEEK_BASE_URL)
    response = client.chat.completions.create(
        model=_DEEPSEEK_MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        temperature=temperature,
        stream=False,
    )
    # ``message.content`` is optional in the API response; treat a missing
    # reply as empty text instead of crashing on ``None.strip()``.
    content = (response.choices[0].message.content or "").strip()
    parsed = parse_json_from_text(content)
    return parsed if parsed is not None else content


def extract_application_text(application_dir: str) -> str:
    """Extract the rectified application text from ``application_dir``.

    Image files take precedence: when any are present they are OCR'd and
    document files are ignored.  Otherwise document files are read and joined
    with blank lines.  The raw text is then passed through
    ``RectifyClient_application.extract_legal_document``.

    Returns:
        The rectified text, or ``""`` when the directory holds no supported file.
    """
    image_files = list_files_by_ext(application_dir, list(_IMAGE_EXTS))
    if image_files:
        raw_text = LayoutParserClient_application().parse(image_files)
    else:
        doc_files = list_files_by_ext(application_dir, list(_DOC_EXTS))
        if not doc_files:
            return ""
        reader = DocumentReader()
        raw_text = "\n\n".join(reader.process_input(path) for path in doc_files)
    return RectifyClient_application().extract_legal_document(raw_text)


def extract_transcript_text(transcript_dir: str) -> str:
    """Read every supported file in ``transcript_dir`` and return the cleaned transcript.

    Returns:
        The output of ``RectifyClient_transcript.clean_text``; the raw joined
        text when cleaning yields something falsy; ``""`` when no file is found.
    """
    files = list_files_by_ext(transcript_dir, list(_ALL_EXTS))
    if not files:
        return ""
    reader = DocumentReader()
    raw_text = "\n\n".join(reader.process_input(path) for path in files)
    cleaned = RectifyClient_transcript().clean_text(raw_text)
    return cleaned or raw_text


def build_case_profile(application_text: str, transcript_text: str) -> Dict[str, Any]:
    """Ask the LLM for a structured case profile.

    Returns:
        The LLM's dict when it contains a dict under ``"case_profile"``;
        otherwise an empty profile skeleton with the same keys.
    """
    user_content = f"申请书:\n{application_text}\n\n庭审笔录:\n{transcript_text}"
    result = call_deepseek_json(_CASE_PROFILE_PROMPT, user_content)
    # Require the nested value to be a dict: downstream code calls ``.get`` on it.
    if isinstance(result, dict) and isinstance(result.get("case_profile"), dict):
        return result
    return _empty_case_profile()


def extract_dispute_points(application_text: str, transcript_text: str) -> List[str]:
    """Extract the dispute points of the case.

    The LLM is asked first.  When it yields no usable point (unparseable
    reply, wrong shape, or an empty list) the function falls back to a regex
    over the transcript (or the application when the transcript is empty)
    looking for a ``争议焦点为`` heading.

    Returns:
        A list of non-blank dispute-point strings, possibly empty.
    """
    user_content = f"申请书:\n{application_text}\n\n庭审笔录:\n{transcript_text}"
    result = call_deepseek_json(_DISPUTE_POINTS_PROMPT, user_content)
    if isinstance(result, dict):
        points = result.get("dispute_points", [])
        if isinstance(points, list):
            usable = [p for p in points if isinstance(p, str) and p.strip()]
            if usable:
                return usable

    text = transcript_text or application_text or ""
    match = _DISPUTE_HEADING_RE.search(text)
    if match is None:
        return []
    pieces = _DISPUTE_SPLIT_RE.split(match.group(1))
    return [piece.strip() for piece in pieces if piece.strip()]


def list_evidence_categories(evidence_dir: str) -> List[str]:
    """List the evidence categories found under ``evidence_dir``.

    Each immediate sub-directory is a category.  When there are none but the
    directory itself holds supported files, the single pseudo-category
    ``"未分类"`` is returned.

    Returns:
        Sorted category names, ``["未分类"]``, or ``[]`` when the path is
        empty, missing, or not a directory.
    """
    # ``isdir`` rather than ``exists``: ``os.listdir`` raises on a regular file.
    if not evidence_dir or not os.path.isdir(evidence_dir):
        return []
    subdirs = sorted(
        name
        for name in os.listdir(evidence_dir)
        if os.path.isdir(os.path.join(evidence_dir, name))
    )
    if subdirs:
        return subdirs
    loose_files = list_files_by_ext(evidence_dir, list(_ALL_EXTS))
    return [_UNCATEGORIZED] if loose_files else []


def select_relevant_categories(dispute_points: List[str], categories: List[str]) -> Dict[str, List[str]]:
    """Ask the LLM which evidence categories are relevant to each dispute point.

    Args:
        dispute_points: Dispute points to match.
        categories: Known evidence category names.

    Returns:
        A mapping from dispute point to the categories the LLM chose, keeping
        only names that appear in ``categories``.  Points with no valid
        category are omitted.  Empty when either input is empty or the reply
        has an unexpected shape.
    """
    if not dispute_points or not categories:
        return {}
    user_content = json.dumps(
        {"dispute_points": dispute_points, "evidence_categories": categories},
        ensure_ascii=False,
    )
    result = call_deepseek_json(_CATEGORY_MAPPING_PROMPT, user_content)

    mapping: Dict[str, List[str]] = {}
    if not isinstance(result, dict):
        return mapping
    items = result.get("mapping", [])
    if not isinstance(items, list):
        return mapping

    known = set(categories)
    for item in items:
        # LLM output is untrusted: skip entries that are not objects.
        if not isinstance(item, dict):
            continue
        point = item.get("dispute_point")
        cats = item.get("categories", [])
        if not isinstance(point, str) or not isinstance(cats, list):
            continue
        valid = [c for c in cats if isinstance(c, str) and c in known]
        if valid:
            mapping[point] = valid
    return mapping


def limit_files(files: List[str], limit: int = 10) -> List[str]:
    """Cap ``files`` at ``limit`` entries, keeping both ends of the list.

    When the list is longer than ``limit`` the first ``limit // 2`` and the
    last ``limit - limit // 2`` entries are kept.

    Returns:
        ``files`` itself when it is short enough, a new trimmed list
        otherwise, and ``[]`` when ``limit`` is zero or negative.
    """
    # Guard first: with ``limit == 0`` the tail slice would be ``files[-0:]``,
    # which is the whole list rather than nothing.
    if limit <= 0:
        return []
    if len(files) <= limit:
        return files
    head = limit // 2
    tail = limit - head
    return files[:head] + files[-tail:]


def ocr_evidence_files(files: List[str]) -> List[Dict[str, Any]]:
    """Extract truncated text from evidence files.

    All image files are OCR'd together and produce one entry with ``files``,
    ``text`` and ``lines``.  Each document file produces its own entry with
    ``files`` and ``text``.  Files with any other extension are ignored.

    Returns:
        The list of entries (image batch first, then documents in input order).
    """
    images = [f for f in files if os.path.splitext(f)[1].lower() in _IMAGE_EXTS]
    docs = [f for f in files if os.path.splitext(f)[1].lower() in _DOC_EXTS]

    results: List[Dict[str, Any]] = []
    if images:
        ocr_output = LayoutParserClient_evidence().parse(images)
        extracted = extract_evidence_text_from_ocr(ocr_output)
        results.append(
            {
                "files": images,
                "text": extracted["text"][:_MAX_EVIDENCE_CHARS],
                "lines": extracted["lines"][:_MAX_EVIDENCE_LINES],
            }
        )
    if docs:
        reader = DocumentReader()
        for doc in docs:
            text = reader.process_input(doc)
            results.append({"files": [doc], "text": text[:_MAX_EVIDENCE_CHARS]})
    return results


def retrieve_laws(dispute_points: List[str]) -> Dict[str, List[Dict[str, str]]]:
    """Run the law retrieval (``law_rag_run``) once per dispute point.

    Returns:
        A mapping from each dispute point to whatever ``law_rag_run`` returned for it.
    """
    return {point: law_rag_run(point) for point in dispute_points}


def final_judgement(
    case_profile: Dict[str, Any],
    dispute_points: List[str],
    law_results: Dict[str, Any],
    evidence_results: Dict[str, Any],
    similar_cases: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """Ask the LLM for the final judgement given all collected material.

    All five arguments are serialised into one JSON payload; they must be
    JSON-serialisable.

    Returns:
        The LLM's dict reply, or an empty judgement skeleton
        (``final_decision``, ``reasoning``, ``dispute_point_findings``) when
        the reply is not a dict.
    """
    user_content = json.dumps(
        {
            "case_profile": case_profile,
            "dispute_points": dispute_points,
            "law_results": law_results,
            "evidence_results": evidence_results,
            "similar_cases": similar_cases,
        },
        ensure_ascii=False,
    )
    result = call_deepseek_json(_FINAL_JUDGEMENT_PROMPT, user_content)
    if isinstance(result, dict):
        return result
    return {"final_decision": "", "reasoning": "", "dispute_point_findings": []}


def build_case_summary_text(case_profile: Dict[str, Any], dispute_points: List[str]) -> str:
    """Build the plain-text summary used for embedding the case.

    The summary concatenates, one per line and skipping empty parts: the
    parties, the space-joined claims, the background and the space-joined
    dispute points.
    """
    profile = case_profile.get("case_profile") if isinstance(case_profile, dict) else None
    # The profile comes from an LLM; tolerate a missing or malformed value.
    if not isinstance(profile, dict):
        profile = {}

    parts: List[str] = []
    parties = profile.get("parties")
    if parties:
        parts.append(str(parties))
    claims = profile.get("claims", [])
    if isinstance(claims, list) and claims:
        parts.append(" ".join(str(c) for c in claims))
    background = profile.get("background")
    if background:
        parts.append(str(background))
    if dispute_points:
        parts.append(" ".join(str(p) for p in dispute_points))
    return "\n".join(p for p in parts if p)


def compute_similar_cases(embedding: List[float], top_k: int = 50, limit: int = 3) -> List[Dict[str, Any]]:
    """Return the stored cases most similar to ``embedding``.

    Args:
        embedding: Embedding of the current case summary.
        top_k: Number of candidates requested from ``fetch_similar_cases``.
        limit: Maximum number of cases returned.

    Returns:
        Up to ``limit`` dicts (``case_id``, ``summary_text``,
        ``final_judgement``, ``similarity``) sorted by descending cosine
        similarity.  Candidates with a non-positive score are dropped.
    """
    scored = []
    for item in fetch_similar_cases(embedding, top_k=top_k):
        score = cosine_similarity(embedding, item.get("embedding", []))
        if score <= 0:
            continue
        try:
            stored_judgement = json.loads(item.get("final_judgement_json") or "{}")
        except (TypeError, ValueError):
            # Best effort: a corrupt or non-text stored judgement should not
            # hide an otherwise similar case.
            stored_judgement = {}
        scored.append(
            {
                "case_id": item.get("case_id"),
                "summary_text": item.get("summary_text"),
                "final_judgement": stored_judgement,
                "similarity": score,
            }
        )
    scored.sort(key=lambda entry: entry["similarity"], reverse=True)
    return scored[:limit]


def _choose_categories(mapping: Dict[str, List[str]], categories: List[str]) -> List[str]:
    """Return the sorted evidence categories to process.

    Uses the union of the LLM-selected categories; when that is empty, falls
    back to ``"证据清单"`` if present, else to every known category.
    """
    selected = {cat for cats in mapping.values() for cat in cats}
    if not selected and categories:
        if _EVIDENCE_LIST_CATEGORY in categories:
            selected.add(_EVIDENCE_LIST_CATEGORY)
        else:
            selected.update(categories)
    return sorted(selected)


def _collect_evidence(evidence_dir: str, selected: List[str]) -> Dict[str, Any]:
    """OCR/read up to ``_MAX_FILES_PER_CATEGORY`` files for each selected category."""
    evidence_results: Dict[str, Any] = {}
    for category in selected:
        # The pseudo-category maps to files lying directly in the evidence dir.
        if category == _UNCATEGORIZED:
            category_path = evidence_dir
        else:
            category_path = os.path.join(evidence_dir, category)
        files = limit_files(list_files_by_ext(category_path, list(_ALL_EXTS)), _MAX_FILES_PER_CATEGORY)
        evidence_results[category] = ocr_evidence_files(files) if files else []
    return evidence_results


def process_case_text_with_evidence(case_id: str, application_text: str, transcript_text: str, evidence_dir: str) -> Dict[str, Any]:
    """Run the analysis pipeline on already-extracted texts plus an evidence directory.

    Args:
        case_id: Case identifier.  Currently unused by this function; kept
            for interface compatibility.
        application_text: Extracted application text.
        transcript_text: Extracted hearing transcript text.
        evidence_dir: Directory holding the evidence (optionally split into
            category sub-directories).

    Returns:
        A dict with ``case_profile``, ``dispute_points``, ``evidence_results``,
        ``law_results``, ``summary_text``, ``embedding``, ``similar_cases``
        and ``final_judgement``.
    """
    case_profile = build_case_profile(application_text, transcript_text)
    dispute_points = extract_dispute_points(application_text, transcript_text)

    categories = list_evidence_categories(evidence_dir)
    mapping = select_relevant_categories(dispute_points, categories)
    evidence_results = _collect_evidence(evidence_dir, _choose_categories(mapping, categories))

    law_results = retrieve_laws(dispute_points)
    summary_text = build_case_summary_text(case_profile, dispute_points)
    embedding = compute_embedding(summary_text)
    similar_cases = compute_similar_cases(embedding)
    judgement = final_judgement(case_profile, dispute_points, law_results, evidence_results, similar_cases)

    return {
        "case_profile": case_profile,
        "dispute_points": dispute_points,
        "evidence_results": evidence_results,
        "law_results": law_results,
        "summary_text": summary_text,
        "embedding": embedding,
        "similar_cases": similar_cases,
        "final_judgement": judgement,
    }


def process_case_dir(case_dir: str) -> Dict[str, Any]:
    """Process one case directory end to end and persist the result.

    The directory is expected to contain the sub-directories ``申请书``
    (application), ``庭审笔录`` (hearing transcript) and ``证据`` (evidence).
    The case id is the directory's base name.

    Side effects:
        Calls ``init_case_db`` and stores the analysed case via
        ``store_case_record``.

    Returns:
        A dict with the input ``case_dir``, both extracted texts and the
        analysis results (profile, dispute points, laws, evidence, final
        judgement, similar cases).
    """
    init_case_db()
    # Strip trailing separators so ``basename`` is not empty; include the
    # alternative separator (``/`` on Windows) when the platform defines one.
    trailing_seps = os.sep + (os.altsep or "")
    case_id = os.path.basename(case_dir.rstrip(trailing_seps))

    application_text = extract_application_text(os.path.join(case_dir, "申请书"))
    transcript_text = extract_transcript_text(os.path.join(case_dir, "庭审笔录"))
    evidence_dir = os.path.join(case_dir, "证据")

    result = process_case_text_with_evidence(case_id, application_text, transcript_text, evidence_dir)
    store_case_record(
        case_id,
        result["summary_text"],
        result["case_profile"],
        result["dispute_points"],
        result["law_results"],
        result["evidence_results"],
        result["final_judgement"],
        result["embedding"],
    )
    return {
        "case_dir": case_dir,
        "application_text": application_text,
        "transcript_text": transcript_text,
        "case_profile": result["case_profile"],
        "dispute_points": result["dispute_points"],
        "law_results": result["law_results"],
        "evidence_results": result["evidence_results"],
        "final_judgement": result["final_judgement"],
        "similar_cases": result["similar_cases"],
    }