From 122b75ee3fd13808ec39d2c67920bce7652a5bb9 Mon Sep 17 00:00:00 2001 From: autolordz Date: Fri, 20 Sep 2019 18:47:36 +0800 Subject: [PATCH] seperate pyfiles and optimize group cases --- README.md | 57 +-- configure.py | 56 +++ copyinfos.py | 133 +++++++ demo_docs/conf.txt | 13 + df_progress.py | 157 ++++++++ df_transform.py | 106 ++++++ docx-content-modify.py | 801 +++-------------------------------------- getjdocs.py | 126 +++++++ globalvar.py | 70 ++++ rqlist.md | 5 + util.py | 135 +++++++ 11 files changed, 871 insertions(+), 788 deletions(-) create mode 100644 configure.py create mode 100644 copyinfos.py create mode 100644 demo_docs/conf.txt create mode 100644 df_progress.py create mode 100644 df_transform.py create mode 100644 getjdocs.py create mode 100644 globalvar.py create mode 100644 rqlist.md create mode 100644 util.py diff --git a/README.md b/README.md index 888bd01..b098bd1 100644 --- a/README.md +++ b/README.md @@ -3,26 +3,10 @@ > * 法院法务自动化批量生成邮寄单据-Legal agency postal notes automatically generate app > * 给予法务邮递人员从法务OA数据表(excel)和公开的判决书(docx)提取当事人地址内容,批量直接生成邮单。 减轻相关员负担,尤其系列案,人员多地址多,手工输入地址重复性劳动太多,信息容易错漏 - - [![](https://img.shields.io/github/release/autolordz/docx-content-modify.svg?style=popout&logo=github&colorB=ff69b4)](https://github.com/autolordz/docx-content-modify/releases) [![](https://img.shields.io/badge/github-source-orange.svg?style=popout&logo=github)](https://github.com/autolordz/docx-content-modify) [![](https://img.shields.io/github/license/autolordz/docx-content-modify.svg?style=popout&logo=github)](https://github.com/autolordz/docx-content-modify/blob/master/LICENSE) -## 目录 - - - -- [环境](#环境) -- [更新](#更新) -- [内容](#内容) -- [规则](#规则) -- [详细指南](#详细指南) -- [Licence](#licence) - - - - ## 环境 > * conda : 4.6.14 @@ -31,18 +15,17 @@ > * 组件: python-docx,pandas,StyleFrame,configparser > * 打包程序: pyinstaller - ## 更新 -【2019-6-19】 -> * 添加合并系列案功能,节省打印资源 +【2019-9-19】 -【2019-6-12】 +> * 整理合并系列案功能,优化代码 -> * 更新判决书过滤词汇 +【2019-6-19】 + +> * 添加合并系列案功能,节省打印资源 - ## 内容 - [x] 按格式重命名判决书 @@ -57,7 +40,6 @@ - [x] 按照Data表输出寄送邮单 - [x] 填充好所有信息,再次运行就能输出Data表指定邮单 - ## 规则 1. 当事人收信规则,没代理律师的每个当事人一份,有委托律师的只要寄给律师一份,多个律师寄给第一个律师,同一律所也是一份 @@ -89,21 +71,17 @@ Data表部分字段演示: 4. 【适用程序】规则(系列案用): -此处在OA表的【适用程序】填写,人为判断几个案是同一系列案的请在该字段中标注len(str)>3的唯一记号,系列案会自动合并 - -len(str)>3 = 记号多于三个字符 +此处在OA表中当事人几个案件中完全相同就合并为一个案件,发一次邮单,假如人员稍有差别,仍然按原来分开处理 例如: | 【适用程序】 | 【案号】 | | --- | --- | -| AAA | 2773 | -| 2774-2776 | 2774 | -| 2774-2776 | 2775 | -| 2774-2776 | 2776 | +| 2160、2161_集合 | 2160 | +| 2160、2161_集合 | 2161 | -5. config.txt: +5. conf.txt: ```python [config] data_xlsx = data_main.xlsx # 数据模板地址 @@ -114,20 +92,18 @@ flag_append_oa = 1 # 是否导入OA数据 flag_to_postal = 1 # 是否打印邮单 flag_check_jdocs = 0 # 是否检查用户格式,输出提示信息 flag_check_postal = 0 # 是否检查邮单格式,输出提示信息 -flag_output_log = 1 # 是否保存打印 -data_case_codes = # 指定打印案号,可接多个,示例:AAA,BBB,优先级1 +data_case_codes = # 指定打印案号,可接多个,示例:AAA号,BBB号,优先级1 data_date_range = # 指定打印数据日期范围示例:2018-09-01:2018-12-01,优先级2 -data_last_lines = 10 # 指定打印最后行数,优先级3 +data_last_lines = 3 # 指定打印最后行数,优先级3 ``` - ## 详细指南 简称: -- [A表: data_oa.xlsx,OA表自己下载,这个只是参考](./demo_docs/data_oa.xlsx) -- [B表: data_main.xlsx,会自动生成,也要修改](./demo_docs/data_main.xlsx) -- [C目录: jdocs/,判决书目录,要放下载的判决书](./demo_docs/jdocs/) -- [D文档: sheet.docx,邮单模板,按照背景生成邮单](./demo_docs/sheet.docx) +- [A表: data_oa.xlsx,OA表自己下载,这个只是参考](./demo_docs/data_oa.xlsx) +- [B表: data_main.xlsx,会自动生成,也要修改](./demo_docs/data_main.xlsx) +- [C目录: jdocs/,判决书目录,要放下载的判决书](./demo_docs/jdocs/) +- [D文档: sheet.docx,邮单模板,按照背景生成邮单](./demo_docs/sheet.docx) - [E目录: postal/,邮单目录](./demo_docs/postal/) 1. 根据 **A表** 格式,整理自己的OA表(没数据是没用的),先在OA表中修改【适用程序】(系列案),修改conf.txt文件,参考[规则](#规则),如文件丢失再次运行会生成 @@ -147,9 +123,8 @@ data_last_lines = 10 # 指定打印最后行数,优先级3 5. 第二次运行(带【诉讼代理人】) 会重复 3.4. 3.5. 3.6. -6. 小白没有python环境,可以直接下载最新的exe版本,[win7/win10(32/64))](https://github.com/autolordz/docx-content-modify/releases/download/1.0.1/exe-win7win10-8962f68c.zip),仍然需要设置config文件 +6. 小白没有python环境,可以直接下载最新的exe版本,使用前先配置conf.txt文件 - ## Licence [See Licence](https://github.com/autolordz/docx-content-modify/blob/master/LICENSE) diff --git a/configure.py b/configure.py new file mode 100644 index 0000000..eaa427e --- /dev/null +++ b/configure.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Sep 11 11:41:46 2019 + +@author: autol +""" + +import configparser + +#%% config and default values + +def write_config(cfgfile): + cfg = configparser.ConfigParser(allow_no_value=1, + inline_comment_prefixes=('#', ';')) + + cfg['config'] = dict( + data_xlsx = 'data_main.xlsx # 数据模板地址', + data_oa_xlsx = 'data_oa.xlsx # OA数据地址', + sheet_docx = 'sheet.docx # 邮单模板地址', + flag_fill_jdocs_infos = '1 # 是否填充判决书地址', + flag_append_oa = '1 # 是否导入OA数据', + flag_to_postal = '1 # 是否打印邮单', + flag_check_jdocs = '0 # 是否检查用户格式,输出提示信息', + flag_check_postal = '0 # 是否检查邮单格式,输出提示信息', + data_case_codes = ' # 指定打印案号,可接多个,示例:AAA,BBB,优先级1', + data_date_range = ' # 指定打印数据日期范围示例:2018-09-01:2018-12-01,优先级2', + data_last_lines = '3 # 指定打印最后行数,优先级3', + ) + + with open(cfgfile, 'w',encoding='utf-8-sig') as configfile: + cfg.write(configfile) + print('>>> 重新生成配置 %s ...'%cfgfile) + return cfg['config'] + + +#%% +def read_config(cfgfile): + cfg = configparser.ConfigParser(allow_no_value=True, + inline_comment_prefixes=('#', ';')) + cfg.read(cfgfile,encoding='utf-8-sig') + ret = dict( + data_xlsx = cfg['config']['data_xlsx'], + data_oa_xlsx = cfg['config']['data_oa_xlsx'], + sheet_docx = cfg['config']['sheet_docx'], + data_case_codes = cfg['config']['data_case_codes'], + data_date_range = cfg['config']['data_date_range'], + data_last_lines = cfg['config']['data_last_lines'], + flag_fill_jdocs_infos = int(cfg['config']['flag_fill_jdocs_infos']), + flag_append_oa = int(cfg['config']['flag_append_oa']), + flag_to_postal = int(cfg['config']['flag_to_postal']), + flag_check_jdocs = int(cfg['config']['flag_check_jdocs']), + flag_check_postal = int(cfg['config']['flag_check_postal']), + ) + return ret +# return dict(cfg.items('config')) + diff --git a/copyinfos.py b/copyinfos.py new file mode 100644 index 0000000..1886645 --- /dev/null +++ b/copyinfos.py @@ -0,0 +1,133 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Sep 11 12:08:08 2019 + +@author: autol +""" + + +#%% +import re +from collections import Counter +from util import split_list,user_to_list,save_adjust_xlsx +from globalvar import * + +#%% + +def copy_users_compare(jrow,df,errs=list(' ')): + '''copy users and check users completement + errs=['【OA无用户记录】','【用户错别字】','【字段重复】','【系列案】'] + 如下对比: + 不相交,OA无用户记录 + 判断字段重复,输出重复的内容 + 比例确定怀疑用户错别字,判别不了直接正常输出 + 判决书多于当前案件,认为是系列案 + 判决书少于当前案件,当前案件缺部分地址 + ''' + + code0 = str(df['案号']).strip() + code1 = str(df['原一审案号']).strip() + jcode = str(jrow['判决书源号']).strip() + x = Counter(user_to_list(df['当事人'])) # 当前案件 + y = Counter(list(jrow['new_adr'].keys())) # 判决书 + rxy = len(list((x&y).elements()))/len(list((x|y).elements())) + rxyx = len(list((x&y).elements()))/len(list(x.elements())) + rxyy = len(list((x&y).elements()))/len(list(y.elements())) +# print('x=',x);print('y=',y);print('rxy=',rxy) +# print('rxyx=',rxyx);print('rxyy=',rxyy) + if rxy == 0: # 不相交,完全无关 + return errs[0] + if max(x.values()) > 1 or max(y.values()) > 1: # 有字段重复 + xdu = [k for k,v in x.items() if v > 1] # 重复的内容 + ydu = [k for k,v in y.items() if v > 1] + print_log('>>> %s 用户有字段重复【%s】-【案件:%s】 vs 【判决书:%s】' + %(code0,'{0:.0%}'.format(rxy),xdu,ydu)) + return errs[2] + if rxy == 1: # 完全匹配 + return df['当事人'] + if 0 < rxy < 1: # 错别字 + dx = list((x-y).elements()) + dy = list((y-x).elements()) + xx = Counter(''.join(dx)) + yy = Counter(''.join(dy)) + rxxyy = len(list(xx&yy.keys()))/len(list(xx|yy.keys())) +# print('rxxyy=',rxxyy) + if rxxyy >= .6: + print_log('>>> %s 认为【错别字率 %s】->【案件:%s vs 判决书:%s】' + %(code0,'{0:.0%}'.format(1-rxxyy),dx,dy)) + return errs[1] + elif rxxyy >= .2: + print_log('>>> %s 认为【不好判断当正常处理【差异率 %s】vs【相同范围:%s】->【差异范围:案件:%s vs 判决书:%s】 ' + %(code0,'{0:.0%}'.format(1-rxxyy), + list((x&y).elements()), + dx,dy)) + return df['当事人'] + if rxyx > .8: + print_log('>>> %s 案件 %s人 < 判决书 %s人'%(code0,len(x),len(y))) + if jcode != code1:# 系列案 + print_log('>>> %s 认为【系列案,判决书人员 %s 多出地址】'%(code0,list((y-x).elements()))) + return errs[3] + else: + return df['当事人'] + elif rxyy > .8: + print_log('>>> %s 案件 %s人 > 判决书 %s人'%(code0,len(x),len(y))) + print_log('>>> %s 认为【当前案件人员 %s 缺地址】'%(code0,list((x-y).elements()))) + return df['当事人'] + return errs[0] + + +def copy_rows_adr1(x,n_adr): + ''' copy jdocs address to address column + 格式:['当事人','诉讼代理人','地址','new_adr','案号'] + 同时排除已有代理人的信息 + ''' + user = x['当事人'];agent = x['诉讼代理人'];adr = x['地址']; codes = x['案号'] + if not isinstance(n_adr,dict): + return adr + else: + y = split_list(r'[,,]',adr) + adr1 = y.copy() + for i,k in enumerate(n_adr): + by_agent = any([k in ag for ag in re.findall(r'[\w+、]*\/[\w+]*',agent)]) # 找到代理人格式 'XX、XX/XX_123123' + if by_agent and k in adr: # remove user's address when user with agent 用户有代理人就不要地址 + y = list(filter(lambda x:not k in x,y)) + if type(n_adr) == dict and not k in adr and k in user and not by_agent: + y += [k+adr_tag+n_adr.get(k)] # append address by rules 输出地址格式 + adr2 = y.copy() + adr = ','.join(list(filter(None, y))) + if Counter(adr1) != Counter(adr2) and adr and flag_check_jdocs: + print_log('>>> 【%s】成功复制判决书地址=>【%s】'%(codes,adr)) + return adr + +address_tmp_xlsx = 'address_tmp.xlsx' + +def copy_rows_user_func(dfj,dfo): + + '''copy users line regard adr user''' + errs = ['【OA无用户记录】','【用户错别字】','【字段重复】','【系列案】'] + + dfo['判决书源号'] = '' + + def find_source(): + print_log('\n>>> 判决书信息 | 案号=%s | 源号=%s | 判决书源号=%s'%(code0,code1,jcode)) + dfo.loc[i,'地址'] = copy_rows_adr1(dfor,n_adr) + dfo.loc[i,'判决书源号'] = jcode + + for (i,dfor) in dfo.iterrows(): + for (j,dfjr) in dfj.iterrows(): + code0 = str(dfor['案号']).strip() + code1 = str(dfor['原一审案号']).strip() + jcode = str(dfjr['判决书源号']).strip() + n_adr = dfjr['new_adr'] + if isinstance(n_adr,dict): + if not n_adr:continue# 提取jdocs字段失败 + if code1 == jcode:# 同案号,则找到内容 + find_source() ; break + else:#[::-1] # 没案号 + tag1 = copy_users_compare(dfjr,dfor,errs) + if tag1 not in errs: + find_source() ; break + else: pass + dfj = dfj.fillna('') + save_adjust_xlsx(dfj,address_tmp_xlsx,textfit=('判决书源号','new_adr')) # 保存临时提取信息 + return dfo \ No newline at end of file diff --git a/demo_docs/conf.txt b/demo_docs/conf.txt new file mode 100644 index 0000000..c6c5668 --- /dev/null +++ b/demo_docs/conf.txt @@ -0,0 +1,13 @@ +[config] +data_xlsx = data_main.xlsx # 数据模板地址 +data_oa_xlsx = data_oa.xlsx # OA数据地址 +sheet_docx = sheet.docx # 邮单模板地址 +flag_fill_jdocs_infos = 1 # 是否填充判决书地址 +flag_append_oa = 1 # 是否导入OA数据 +flag_to_postal = 1 # 是否打印邮单 +flag_check_jdocs = 0 # 是否检查用户格式,输出提示信息 +flag_check_postal = 0 # 是否检查邮单格式,输出提示信息 +data_case_codes = # 指定打印案号,可接多个,示例:AAA,BBB,优先级1 +data_date_range = # 指定打印数据日期范围示例:2018-09-01:2018-12-01,优先级2 +data_last_lines = 3 # 指定打印最后行数,优先级3 + diff --git a/df_progress.py b/df_progress.py new file mode 100644 index 0000000..48edba7 --- /dev/null +++ b/df_progress.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Sep 11 15:11:39 2019 + +@author: autol +""" + +import os,re +from glob import glob +import pandas as pd +import util as ut +from copyinfos import copy_rows_user_func +from getjdocs import get_all_jdocs,rename_jdocs_codes +from globalvar import * + +#%% + +def df_oa_append(dfo): + '''main fill OA data into df data''' + + if flag_append_oa: + if not os.path.exists(data_oa_xlsx): + print_log('>>> 没有找到OA模板 %s...不处理!!'%data_oa_xlsx);return dfo + dfoa = pd.read_excel(data_oa_xlsx,sort=False)[titles_oa].fillna('') # only oa columns + df0 = dfo.copy() + + if '适用程序' not in dfo.columns: dfo['适用程序'] = '' # 新建一栏用于系列案 + print('>>> OA数据共%s条'%len(dfoa)) + dfoa = df_make_subset(dfoa) # subset by columns + + dfoa.rename(columns={'承办人':'主审法官'},inplace=True) + dfoa = df_read_fix(dfoa) # fix empty data columns + dfoa['add_index'] = 'new' ; dfo['add_index'] = 'old' + + ec = list(set(ut.expand_codes(dfo['案号'].to_list()))) # 展开案号 + add = dfoa[~dfoa['案号'].isin(ec)] # 新增条目 + dfo = pd.concat([dfo,add],sort=1).fillna('') + dfo.sort_values(by=['立案日期','案号'],inplace=True) +# dfo.drop_duplicates(['立案日期','案号'],keep='first',inplace=True) + df_t0 = dfo[dfo['add_index'] == 'old'] ; df_t1 = dfo[dfo['add_index'] == 'new'] + df_t1l = df_t1['立案日期'].to_list() + print_log('>>> 截取OA【%s条】-Data记录 old【%s条】-new【%s条】...'%(len(dfoa),len(df_t0),len(df_t1))) + if df_t1l: + print_log('>>> 实际添加【%s条】【%s】条共【%s】...'%(len(df_t1), + str(df_t1l[0])+':'+str(df_t1l[-1]), + len(dfo))) + save_df(df0,dfo) + return dfo,dfoa['原一审案号'].to_list() + +def merge_group_cases(dfo): + + dfn = dfo.copy() # [dfo['适用程序'].str.len()>2] + ds = dfo[['适用程序','当事人']].drop_duplicates().copy();ds # 依据'适用程序','当事人'定性系列案 + for tag1,tag2 in zip(ds['适用程序'].to_list(),ds['当事人'].to_list()): # ds是系列案标签和内容 + if tag2: + dgroup = dfo[dfo['当事人']==tag2] # 查找dfo拥有的系列案 + elif tag1: + dgroup = dfo[dfo['适用程序']==tag1] # 先查 '当事人' 后查 '适用程序' + if len(dgroup) > 1: + ss = dgroup.iloc[0].copy() # 系列案选一个 + if done_tag not in ss['适用程序']: # 处理系列案 + sn0 = dgroup['案号'].to_list() + sn = [re.search(r'\d+(?=号)|$',x).group(0) for x in sn0] + if sn[0] != sn[-1]: +# ss['案号'] = re.sub(r'\d+(?=号)','%s-%s'%(sn[0],sn[-1]),sn0[0]) + ss['案号'] = re.sub(r'\d+(?=号)','、'.join(sn),sn0[0]) + print_log('>>> 发现并合并系列案:', ss['案号']) + if not ss['适用程序']: ss['适用程序'] = '、'.join(sn) + ss['适用程序'] += done_tag + dfn = pd.concat([dfn[~dfn.isin(dgroup).all(1)], ss.to_frame().T]) #合并系列并过滤原来条目 + save_df(dfo,dfn) + return dfn + +#%% df process steps + +def fill_infos_func(dfj,dfo): + '''填充判决书内容''' + dfn = copy_rows_user_func(dfj,dfo) + rename_jdocs_codes(dfn) + return dfn + + +def df_read_fix(df): + '''fix codes remove error format 处理案号格式''' + df[['立案日期','案号','主审法官','当事人']] = df[['立案日期','案号','主审法官','当事人']].replace('',float('nan')) + df.dropna(how='any',subset=['立案日期','案号','主审法官','当事人'],inplace=True) + df['原一审案号'] = df['原一审案号'].fillna('') + df[['案号','原一审案号']] = df[['案号','原一审案号']].applymap(ut.case_codes_fix) + return df + +def df_fill_infos(dfo): + '''main fill jdocs infos''' + if len(dfo) == 0: return dfo + docs = glob(ut.parse_subpath(jdocs_path,'*.docx')) # get jdocs + if not docs: return dfo + dfj = get_all_jdocs(docs) + if len(dfj) == 0: print_log('>>> 没有找到判决书...不处理!!') ; return dfo + dfn = dfo.copy() + dfn = fill_infos_func(dfj,dfn) + if flag_fill_jdocs_infos: + save_df(dfo,dfn) + return dfn + +def save_df(df_old,df_new): # 内容相同就不管 + '''保存并对比记录''' + try: + df_old = ut.titles_resort(df_old,titles_main) + df_new = ut.titles_resort(df_new,titles_main) + pd.testing.assert_frame_equal(df_old,df_new) + print_log('\n>>> 内容没变,不用保存 ..\n') + return 0 + except Exception: # 不同则保存 + ut.save_adjust_xlsx(df_new,data_xlsx) + return 1 + +def df_make_subset(df): + + ''' + cut orgin data into subset by conditions + d_codes: 多个指定案号例如: (2018)哈哈1234号,(2018)哈哈3333号 + d_range: 2019-08-13:2019-08-27 + ''' + d_codes,d_range,d_lines = data_case_codes,data_date_range,0 + if data_last_lines: d_lines = int(data_last_lines) + + ct,dats = ut.check_time(d_range);dats +# ct,dats = check_time('2019-08-13:2019-08-27');dats + if d_codes: + dcc = ut.split_list(r'[,,;;]',d_codes) + dcc = list(filter(None,[ut.case_codes_fix(x) for x in dcc])) + df = df[df['案号'].isin(dcc) | df['原一审案号'].isin(dcc)] +# df1 = df[df['案号'].isin(dcc) | df['原一审案号'].isin(dcc)];df1 + elif ct: + print_log('\n>>> 预定读取【%s】'%d_range) + df['立案日期'] = pd.to_datetime(df['立案日期']) + df.sort_values(by=['立案日期'],inplace=True) + try: + x = dats[0] + if len(dats) == 1: + y = dats[0] # y = str(datetime.date.today()) + else: + y = dats[1] + x,y = ut.parse_datetime(x),ut.parse_datetime(y) + x1 = df['立案日期'].iloc[0].to_pydatetime() + y1 = df['立案日期'].iloc[-1].to_pydatetime() + t1 = min(x,y); t2 = max(x,y) + t1 = max(t1,x1);t2 = min(t2,y1) + date_start = t1 if t1 else x1 + date_end = t2 if t2 else y1 + df = df[(df['立案日期']>=date_start)&(df['立案日期']<=date_end)].copy() #这里数据分片有警告 + df['立案日期'] = df['立案日期'].astype(str) + except Exception as e: + print_log('>>> 日期异常',e) + elif d_lines: + df = df.tail(d_lines) + return df + diff --git a/df_transform.py b/df_transform.py new file mode 100644 index 0000000..b565d75 --- /dev/null +++ b/df_transform.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Sep 11 17:26:56 2019 + +@author: autol +""" +import re +import pandas as pd +import util as ut +from globalvar import * + +#%% df tramsfrom functions +def clean_rows_aname(x,names): + '''Clean agent name for agent to match address's agent name''' + if names: + for name in names: + if not ut.check_cn_str(name):continue # 非中文名跳过 + if name in x: + x = name;break + x = re.sub(r'_.*','',x) + x = re.sub(path_names_clean,'',x) + return x + +def clean_rows_adr(adr): + '''clean adr format''' + y = ut.split_list(r'[,,]',adr) + if y: + y = list(map(lambda x: x if re.search(r'\/地址[::]',x) else adr_tag + x,y)) + adr = ','.join(list(filter(None, y))) + return adr + +def make_adr(adr,fix_aname=[]): + ''' + clean_aname:合并标识,此处如果没律师,则代理人就是自己 + fix_aname:修正名字错误 + Returns: + level_0 address clean_aname + 0 44 XX市XX镇XXX村 张三 + 1 44 XXX市XX区XXX B律师 + ''' + adr = adr[adr != ''] + adr = adr.str.strip().str.split(r'[,,。]',expand=True).stack() + adr = adr.str.strip().apply(lambda x:clean_rows_adr(x)) + adr = adr.str.strip().str.split(r'\/地址[::]',expand=True).fillna('') + adr.columns = ['aname','address'] + adr['clean_aname'] = adr['aname'].str.strip().apply(lambda x:clean_rows_aname(x,fix_aname)) # clean adr + adr = adr.reset_index().drop(['level_1','aname'],axis=1) + return adr + +def make_agent(agent,fix_aname=[]): + ''' + fix_aname:修正名字错误,假如律师(aname)有多个,则选择第一个律师作为合并标识(clean_aname),注意没有律师的合并就是自己(uname)做代理人 + Returns: + level_0 uname aname clean_aname + 0 44 张三 A律师_123213123 A律师 + 1 44 李四 + 2 44 王五 B律师_123123132123、C律师_123123 B律师 + ''' + agent = agent[agent != ''] + agent = agent.str.strip().str.split(r'[,,。]',expand=True).stack() #Series + agent = agent.str.strip().str.split(r'\/',expand=True).fillna('') #DataFrame + agent.columns = ['uname','aname'] + agent['clean_aname'] = agent['aname'].str.strip().apply(lambda x: clean_rows_aname(x,fix_aname)) + dd_l = agent['uname'].str.strip().str.split(r'、',expand=True).stack().to_frame(name = 'uname').reset_index() + dd_r = agent[agent.columns.difference(['uname'])].reset_index() + agent = pd.merge(dd_l,dd_r,how='outer',on=['level_0','level_1']).drop(['level_1','level_2'],axis=1).fillna('') + return agent + +def merge_user(user,agent): + '''合并后以uname为主,clean_aname是律师标识 + Returns: + level_0 uname aname clean_aname + 0 44 张三 A律师_123213123 A律师 + 2 44 王五 B律师_123123132123、C律师_123123 B律师 + ''' + return pd.merge(user,agent,how='left',on=['level_0','uname']).fillna('') + +def merge_usr_agent_adr(agent,adr): + ''' clean_aname 去除nan,保留曾用名''' + + agent['clean_aname'].replace('',float('nan'),inplace=True) + agent['clean_aname'] = agent['clean_aname'].fillna(agent['uname']).replace(path_names_clean,'') + adr['clean_aname'] = adr['clean_aname'].apply(lambda x: clean_rows_aname(x,agent['clean_aname'].tolist())) + tb = pd.merge(agent,adr,how='outer',on=['level_0','clean_aname']).fillna('') + tb.dropna(how='all',subset=['uname', 'aname'],inplace=True) + return tb + +def reclean_data(tb): + tg = tb.groupby(['level_0','clean_aname','aname','address'])['uname'].apply(lambda x: '、'.join(x.astype(str))).reset_index() + glist = tg['uname'].str.split(r'、',expand=True).stack().values.tolist() + rest = tb[tb['uname'].isin(glist) == False] + x = pd.concat([rest,tg],axis=0,sort=True) + return x + +def sort_data(x,number): + x = x[['level_0','uname','aname','address']].sort_values(by=['level_0']) + x = pd.merge(number,x,how='right',on=['level_0']).drop(['level_0'],axis=1).fillna('') + return x + +def df_check_format(x): + '''check data address and agent format with check flag''' + if x['aname']!='' and not re.search(r'[\/_]',x['aname']): + ut.print_log('>>> 记录\'%s\'---- 【诉讼代理人】格式 \'%s\' 不正确,如无请留空,请自行修改...'%(x['number'],x['aname'])) + if x['address']!='' and not re.search(r'\/地址[::]',x['address']): + ut.print_log('>>> 记录\'%s\'---- 【地址】格式 \'%s\' 不正确,如无请留空,请自行修改...'%(x['number'],x['address'])) + return x \ No newline at end of file diff --git a/docx-content-modify.py b/docx-content-modify.py index ee6c4f9..76d2729 100644 --- a/docx-content-modify.py +++ b/docx-content-modify.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Copyright (c) 2018 Autoz https://github.com/autolordz # Permission is hereby granted, free of charge, to any person obtaining a copy @@ -18,762 +19,74 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. -# -*- coding: utf-8 -*- +""" +Created on Wed Sep 11 17:33:29 2019 + +@author: autol +""" #%% -import os,re,sys,datetime,configparser,shutil +import os,re import pandas as pd -from pandas import DataFrame, read_excel, merge, concat, set_option, to_datetime -isStyleFrame = 1 -from StyleFrame import StyleFrame, Styler -from collections import Counter from docx import Document -from glob import glob -set_option('max_colwidth',500) -set_option('max_rows', 50) -set_option('max_columns',50) - -flag_print = 0 -flag_output_log = 1 - -cfgfile = 'conf.txt' -logname = 'log.txt' -data_xlsx = 'data_main.xlsx' -data_oa_xlsx = 'data_oa.xlsx' -sheet_docx = 'sheet.docx' -address_tmp_xlsx = 'address_tmp.xlsx' -postal_path = os.path.join('.','postal') -jdocs_path = os.path.join('.','jdocs') -flag_fill_jdocs_infos = 1 -flag_append_oa = 1 -flag_to_postal = 1 -flag_check_jdocs = 0 -flag_check_postal = 0 -data_case_codes = 'AAA,BBB' -data_date_range = '2018-09-01:2018-12-01' -data_last_lines = 10 -conf_list = 0 - -#%% print_log log - -if os.path.exists(logname): - os.remove(logname) -def print_log(*args, **kwargs): - print(*args, **kwargs) - if flag_output_log: - with open(logname, "a",encoding='utf-8') as file: - print(*args, **kwargs, file=file) - else: - if os.path.exists(logname): - os.remove(logname) +#import sys +#sys.stderr = open(os.devnull, "w") # silence stderr +from globalvar import * + +#%% +import util as ut +from df_progress import df_fill_infos,df_oa_append,df_read_fix,merge_group_cases +from df_transform import df_check_format +from df_transform import make_adr,make_agent,merge_user,merge_usr_agent_adr +from df_transform import reclean_data,sort_data #%% -print_log(''' +print(''' Postal Notes Automatically Generate App -Updated on Thu Jun 19 2019 +Updated on Thu Sept 19 2019 Depends on: python-docx,pandas,StyleFrame,configparser @author: Autoz (autolordz@gmail.com) ''') -#%% config and default values - - -def set_default_value(**kwargs): - global data_date_range - data_date_range = kwargs.get('data_date_range') if kwargs.get('data_date_range') != None else '# 2018-01-01:2018-12-01' - -def write_config(): - cfg = configparser.ConfigParser(allow_no_value=True, - inline_comment_prefixes=('#', ';')) - cfg['config'] = {'data_xlsx': data_xlsx+' # 数据模板地址', - 'data_oa_xlsx': data_oa_xlsx+' # OA数据地址', - 'sheet_docx': sheet_docx+' # 邮单模板地址', - 'flag_fill_jdocs_infos': str(int(flag_fill_jdocs_infos))+' # 是否填充判决书地址', - 'flag_append_oa': str(int(flag_append_oa))+' # 是否导入OA数据', - 'flag_to_postal': str(int(flag_to_postal))+' # 是否打印邮单', - 'flag_check_jdocs': str(int(flag_check_jdocs))+' # 是否检查用户格式,输出提示信息', - 'flag_check_postal': str(int(flag_check_postal))+' # 是否检查邮单格式,输出提示信息', - 'flag_output_log': str(flag_output_log)+' # 是否保存打印', - 'data_case_codes': ' # 指定打印案号,可接多个,示例:AAA,BBB,优先级1', - 'data_date_range': ' # 指定打印数据日期范围示例:%s,优先级2'%(data_date_range), - 'data_last_lines': str(data_last_lines)+' # 指定打印最后行数,优先级3', - } - with open(cfgfile, 'w',encoding='utf-8-sig') as configfile: - cfg.write(configfile) - print_log('>>> 重新生成配置 %s ...'%cfgfile) - -def read_config(): - global data_xlsx,data_oa_xlsx,sheet_docx,address_tmp_xlsx,postal_path - global jdocs_path,data_last_lines,data_date_range,data_case_codes - global flag_fill_jdocs_infos,flag_append_oa - global flag_to_postal,flag_check_jdocs,flag_check_jdocs,flag_check_postal,flag_output_log - cfg = configparser.ConfigParser(allow_no_value=True, - inline_comment_prefixes=('#', ';')) - cfg.read(cfgfile,encoding='utf-8-sig') - data_xlsx = cfg['config']['data_xlsx'] - data_oa_xlsx = cfg['config']['data_oa_xlsx'] - sheet_docx = cfg['config']['sheet_docx'] - data_case_codes = cfg.get('config', 'data_case_codes',fallback=data_case_codes) - data_date_range = cfg.get('config', 'data_date_range',fallback=data_date_range) - data_last_lines = int(cfg.get('config','data_last_lines',fallback=data_last_lines)) - flag_fill_jdocs_infos = int(cfg.get('config', 'flag_fill_jdocs_infos',fallback=flag_fill_jdocs_infos)) - flag_append_oa = int(cfg.get('config', 'flag_append_oa',fallback=flag_append_oa)) - flag_to_postal = int(cfg.get('config', 'flag_to_postal',fallback=flag_to_postal)) - flag_check_jdocs = int(cfg.get('config', 'flag_check_jdocs',fallback=flag_check_jdocs)) - flag_check_postal = int(cfg.get('config', 'flag_check_postal',fallback=flag_check_postal)) - flag_output_log = int(cfg.get('config', 'flag_output_log',fallback=flag_output_log)) - return dict(cfg.items('config')) -#%% global variable - -titles_cn = ['立案日期','案号','当事人','诉讼代理人','地址'] -titles_en = ['datetime','number','uname','aname','address'] -titles_oa = ['立案日期','案号','原一审案号','承办人','当事人','适用程序'] -titles_main = ['立案日期','适用程序','案号','原一审案号','判决书源号','主审法官','当事人','诉讼代理人','地址',] - -path_names_clean = re.compile(r'[^A-Za-z\u4e00-\u9fa5()():]') # remain only name including old name 包括括号冒号 -search_names_phone = lambda x: re.search(r'[\w()()::]+\_\d+',x) # phone numbers -path_code_ix = re.compile(r'[((][0-9]+[))].*?号') # case numbers -adr_tag = '/地址:' - -#%% read func -def split_list(regex,L): - return list(filter(None,re.split(regex,L))) - -def user_to_list(u): - '''get name list from user string - Usage: '申请人:张xx, 被申请人:李xx, 原审被告:罗xx(又名罗aa)' - -> ['张xx', '李xx', '罗xx(又名罗aa)'] - ''' - u = split_list(r'[:、,,]',u) - return [x for x in u if not re.search(r'申请人|被申请人|原告|被告|原审被告|上诉人|被上诉人|第三人|原审诉讼地位',x)] - -def check_codes(x): - return bool(re.search(path_code_ix.pattern,str(x))) - -def case_codes_fix(x): - '''fix string with chinese codes format - Usage: 'dsfdsf(2018)中文中文248号sdfsdf' -> '(2018)中文中文248号' - ''' - x = str(x) - x = re.search(path_code_ix.pattern+r'|$',x).group().strip().replace(' ','') - x = x.replace('(','(').replace(')',')') - return x - -def parse_subpath(path,file): - '''make subpath''' - if not os.path.exists(path): - os.mkdir(path) - return os.path.join(path,file) - -def check_cn_str(x): - '''check if string contain chinese''' - return bool(re.search(r'[\u4e00-\u9fa5]',str(x))) - -def parse_datetime(date): - '''datetime transform''' - try:date = datetime.datetime.strptime(date,'%Y-%m-%d') - except ValueError:print_log('时间范围格式有误,默认选取全部日期');date = '' - return date - -def titles_trans(df_list): - '''change titles between Chinese and English''' - titles_cn2en = dict(zip(titles_cn, titles_en)) - titles_en2cn = dict(zip(titles_en, titles_cn)) - trans_cn_en = list(map(lambda x,y:(titles_cn2en if y else titles_en2cn).get(x), - df_list,list(map(check_cn_str,df_list)))) - return trans_cn_en - -def titles_trans_columns(df,titles): - '''sub-replace columns titles you want''' - titles_rest = df.drop(titles,axis=1).columns.tolist() - df = df[titles + titles_rest] - df.columns = titles_trans(titles) + titles_rest - return df - -def titles_resort(df,titles): - '''resort titles with orders''' - titles_rest = df.drop(titles,axis=1).columns.tolist() - return df[titles + titles_rest] - -def save_adjust_xlsx(df,file='test.xlsx',textfit=('当事人', '诉讼代理人', '地址'),width=60): - '''save and re-adjust excel format''' - df = df.reset_index(drop='index').fillna('') - if isStyleFrame: - StyleFrame.A_FACTOR = 5 - StyleFrame.P_FACTOR = 1.2 - sf = StyleFrame(df,Styler(wrap_text = False, shrink_to_fit=True, font_size= 12)) - if('add_index' in df.columns.tolist()): - sf.apply_style_by_indexes(indexes_to_style=sf[sf['add_index'] == 'new'], - styler_obj=Styler(bg_color='yellow'), - overwrite_default_style=False) - sf.apply_column_style(cols_to_style = textfit, - width = width, - styler_obj=Styler(wrap_text=False,shrink_to_fit=True)) - else: - sf.set_column_width_dict(col_width_dict={textfit: width}) - if len(df): - sf.to_excel(file,best_fit=sf.data_df.columns.difference(textfit).tolist()).save() - else: - sf.to_excel(file).save() - else: - df.to_excel(file,index=0) - print_log('>>> 保存文件 => 文件名 \'%s\' => 数据保存成功...' %(file)) - return df - -#%% - -def read_jdocs_table(tables): - codes = '' - for table in tables: - for row in table.rows: - for cell in row.cells: - for paragraph in cell.paragraphs: - x = paragraph.text - if re.search(path_code_ix,x) and len(x) < 25: - codes = case_codes_fix(x) - break - return codes - -def get_jdocs_infos(doc,lines = 20):# search at least 20 lines docs - '''get pre address from judgment docs, return docs pre code and address''' - adrs = {};codes = '' - try:tables = Document(doc).tables - except Exception as e: - print('读取错误 %s ,docx文档问题,请重新另存为,或关闭已打开的docx文档'%e) - return codes,adrs - if tables: codes = read_jdocs_table(tables) - paras = Document(doc).paragraphs - if not paras: - return codes,adrs - if len(paras) > 20: # 多于20行就扫描一般内容 - lines = int(len(paras)/2) - parass = paras[:lines] - for i,para in enumerate(parass): - x = para.text.strip() - if len(x) > 150: continue # 段落大于150字就跳过 - if re.search(path_code_ix,x) and len(x) < 25: - codes = case_codes_fix(x);continue # codes - cond3 = re.search(r'法定代表|诉讼|代理人|判决|律师|请求|证据|辩称|辩论|不服',x) # 跳过非人员信息 - cond4 = re.search(r'上市|省略|区别|借款|保证|签订',x) # 跳过非人员信息,模糊 - cond1 = re.search(r'(?<=[::]).*?(?=[,,。])',x) - cond2 = re.search(r'.*?[省市州县区乡镇村]',x) - if cond3:continue - if cond4:continue - if cond1 and cond2: - ''' - Todo: get user and address - Usage: '被上诉人(原审被告):张三,男,1977年7月7日出生,汉族,住XX自治区(省)XX市XX区1212。现住XX省XX市XX区3434' - -> {'张三': 'XX省XX市XX区3434'} - ''' - try: - name = re.search(r'(?<=[::]).*?(?=[,,。])|$',x).group(0).strip() - name = re.sub(r'[((][下称|原名|反诉|变更前].*?[))]','',name) # filter some special names,notice here will add some words for filter - z = split_list(r'[,,::.。]',x) - z = [re.sub(r'户[籍口]|居住|身份证|所在地|住所地?|住址?|^[现原]住?','',y) for y in z if re.search(r'.*?[省市州县区乡镇村]',y)][-1] # 几个地址选最后一个 remain only address - adr = {name:''.join(z)} - adrs.update(adr) - except Exception as e: - print_log('获取信息失败 =>',e) - return codes,adrs - -def rename_jdoc_x(doc,codes): - '''rename only judgment doc files''' - jdoc_name = os.path.join(os.path.split(doc)[0],'判决书_'+codes+'.docx') - if not codes in doc:# os.path.exists(jdoc_name) - try: - os.rename(doc,jdoc_name) - return True - except Exception as e: - print_log(e) - os.remove(doc) - return False - return False - -def get_all_jdocs(docs): - numlist=[]; nadr = [] - for doc in docs: - codes,adrs = get_jdocs_infos(doc) - if codes: - rename_jdoc_x(doc,codes) - numlist.append(codes) - nadr.append(adrs) - if flag_check_jdocs and codes: - print_log('>>> 判决书信息 【%s】-【%s人】-%s \n'%(codes,len(adrs),adrs)) - numlist = list(map(case_codes_fix,numlist)) - return DataFrame({'判决书源号':numlist,'new_adr':nadr}) - -#%% -def copy_rows_adr(x): - ''' copy jdocs address to address column''' - '''格式:['当事人','诉讼代理人','地址','new_adr','案号']''' - x[:3] = x[:3].astype(str) - user = x[0];agent = x[1];adr = x[2];n_adr = x[3];codes = x[4] - if not isinstance(n_adr,dict): - return adr - else: - y = split_list(r'[,,]',adr) - adr1 = y.copy() - for i,k in enumerate(n_adr): - by_agent = any([k in ag for ag in re.findall(r'[\w+、]*\/[\w+]*',agent)]) # 找到代理人格式 'XX、XX/XX_123123' - if by_agent and k in adr: # remove user's address when user with agent 用户有代理人就不要地址 - y = list(filter(lambda x:not k in x,y)) - if type(n_adr) == dict and not k in adr and k in user and not by_agent: - y += [k+adr_tag+n_adr.get(k)] # append address by rules 输出地址格式 - adr2 = y.copy() - adr = ','.join(list(filter(None, y))) - if Counter(adr1) != Counter(adr2) and flag_check_jdocs and adr:print_log('>>> 【%s】成功复制判决书地址=>【%s】'%(codes,adr)) - return adr - -def copy_users_compare(jrow,df,errs=list(' ')): - '''copy users and check users completement - errs=['【OA无用户记录】','【用户错别字】','【字段重复】','【系列案】'] - 如下对比: - 不相交,OA无用户记录 - 判断字段重复,输出重复的内容 - 比例确定怀疑用户错别字,判别不了直接正常输出 - 判决书多于当前案件,认为是系列案 - 判决书少于当前案件,当前案件缺部分地址 - ''' - - code0 = str(df['案号']).strip() - code1 = str(df['原一审案号']).strip() - jcode = str(jrow['判决书源号']).strip() - x = Counter(user_to_list(df['当事人'])) # 当前案件 - y = Counter(list(jrow['new_adr'].keys())) # 判决书 - rxy = len(list((x&y).elements()))/len(list((x|y).elements())) - rxyx = len(list((x&y).elements()))/len(list(x.elements())) - rxyy = len(list((x&y).elements()))/len(list(y.elements())) - if flag_print: - print('x=',x);print('y=',y);print('rxy=',rxy) - print('rxyx=',rxyx);print('rxyy=',rxyy) - if rxy == 0: # 不相交,完全无关 - return errs[0] - if max(x.values()) > 1 or max(y.values()) > 1: # 有字段重复 - xdu = [k for k,v in x.items() if v > 1] # 重复的内容 - ydu = [k for k,v in y.items() if v > 1] - print_log('>>> 用户有字段重复【%s】-【案件:%s】 vs 【判决书:%s】' - %("{0:.0%}".format(rxy),xdu,ydu)) - return errs[2] - if rxy == 1: # 完全匹配 - return df['当事人'] - if 0 < rxy < 1: # 错别字 - dx = list((x-y).elements()) - dy = list((y-x).elements()) - xx = Counter(''.join(dx)) - yy = Counter(''.join(dy)) - rxxyy = len(list(xx&yy.keys()))/len(list(xx|yy.keys())) - if flag_print:print('rxxyy=',rxxyy) - if rxxyy >= .6: - print_log('>>> 觉得有【错别字率 %s】->【案件:%s vs 判决书:%s】' - %("{0:.0%}".format(1-rxxyy),dx,dy)) - return errs[1] - elif rxxyy >= .2: - print_log('>>> 觉得不好判断当正常处理【差异率 %s】vs【相同范围:%s】->【差异范围:案件:%s vs 判决书:%s】 ' - %("{0:.0%}".format(1-rxxyy), - list((x&y).elements()), - dx,dy)) - return df['当事人'] - if rxyx > .8: - print_log('>>> 案件 %s人 < 判决书 %s人'%(len(x),len(y))) - if jcode != code1:# 系列案 - print_log('>>> 觉得是【系列案,判决书人员 %s 多出地址】'%(list((y-x).elements()))) - return errs[3] - else: - return df['当事人'] - elif rxyy > .8: - print_log('>>> 案件 %s人 > 判决书 %s人'%(len(x),len(y))) - print_log('>>> 觉得有【当前案件人员 %s 缺地址】'%(list((x-y).elements()))) - return df['当事人'] - return errs[0] - -#%% - -def save_jdocs_infos(x): - '''save remane jdocs''' - try: - x = x.fillna('') - save_adjust_xlsx(x,file=address_tmp_xlsx,textfit=('判决书源号','new_adr')) -# x.to_excel(address_tmp_xlsx,index=False) - except Exception as e: - print_log('%s <= 保存失败,请检查... %s'%(address_tmp_xlsx,e)) - - -def new_adr_format(n_adr): - y=[] - for i,k in enumerate(n_adr): - y += [k+adr_tag+n_adr.get(k)] - return (','.join(list(filter(None, y)))) - -def copy_rows_user_func(dfj,dfo): - - def copy_rows_adr1(x,n_adr): - ''' copy jdocs address to address column - 格式:['当事人','诉讼代理人','地址','new_adr','案号'] - 同时排除已有代理人的信息 - ''' - user = x['当事人'];agent = x['诉讼代理人'];adr = x['地址']; codes = x['案号'] - if not isinstance(n_adr,dict): - return adr - else: - y = split_list(r'[,,]',adr) - adr1 = y.copy() - for i,k in enumerate(n_adr): - by_agent = any([k in ag for ag in re.findall(r'[\w+、]*\/[\w+]*',agent)]) # 找到代理人格式 'XX、XX/XX_123123' - if by_agent and k in adr: # remove user's address when user with agent 用户有代理人就不要地址 - y = list(filter(lambda x:not k in x,y)) - if type(n_adr) == dict and not k in adr and k in user and not by_agent: - y += [k+adr_tag+n_adr.get(k)] # append address by rules 输出地址格式 - adr2 = y.copy() - adr = ','.join(list(filter(None, y))) - if Counter(adr1) != Counter(adr2) and flag_check_jdocs and adr:print_log('>>> 【%s】成功复制判决书地址=>【%s】'%(codes,adr)) - return adr - - '''copy users line regard adr user''' - errs = ['【OA无用户记录】','【用户错别字】','【字段重复】','【系列案】'] - - dfo['判决书源号'] = '' - - for (i,dfor) in dfo.iterrows(): - for (j,dfjr) in dfj.iterrows(): - code0 = str(dfor['案号']).strip() - code1 = str(dfor['原一审案号']).strip() - jcode = str(dfjr['判决书源号']).strip() - n_adr = dfjr['new_adr'] - if isinstance(n_adr,dict): - if not n_adr:continue# 提取jdocs字段失败 - if code1 == jcode:# 同案号,则找到内容 - print_log('\n>>> 找到信息_案号=%s__源号=%s__判决书源号=%s'%(code0,code1,jcode)) - dfo.loc[i,'地址'] = copy_rows_adr1(dfor,n_adr) - dfo.loc[i,'判决书源号'] = jcode - break - else:#[::-1] # 没案号 - tag1 = copy_users_compare(dfjr,dfor,errs) - if tag1 not in errs: - print_log('\n>>> 找到信息_案号=%s__源号=%s__判决书源号=%s'%(code0,code1,jcode)) - dfo.loc[i,'地址']= copy_rows_adr1(dfor,n_adr) - dfo.loc[i,'判决书源号'] = jcode - break - else: - pass - save_jdocs_infos(dfj) - return dfo - #%% -def rename_jdocs_codes_x(d,r,old_codes): - '''add jdoc current case codes for reference 判决书改名,包括源案号''' - if str(r[old_codes]) in str(d): - nd = os.path.join(os.path.split(d)[0],'判决书_'+str(r['案号']) +'_原_'+ str(r[old_codes]) + '.docx') - if(d == nd): - return d - try: - if os.path.exists(nd): - os.remove(nd) -# if '_原_' in d: -# shutil.copyfile(d,nd) - else: - os.rename(d,nd) - print('>>> 重命名判决书 => ',nd) - except Exception as e: - print_log(e) - return nd - return d - -def rename_jdocs_codes(dfo): - '''rename with new codes''' - old_codes='判决书源号' - docs = glob(parse_subpath(jdocs_path,'判决书_*.docx')) - df = dfo[dfo[old_codes] != ''] - if docs: - for doc in docs: - for (i,dfr) in df.iterrows(): - if check_codes(dfr[old_codes]) and str(dfr[old_codes]) in doc: - rename_jdocs_codes_x(doc,dfr,old_codes) - break - return None - -def fill_infos_func(dfj,dfo): - '''填充信息并处理系列案''' - dd = dfo[['适用程序','当事人']][dfo['适用程序'].str.len()>2].drop_duplicates().copy() - dfoo = dfo.copy() - for tag1,tag2 in zip(dd['适用程序'].to_list(),dd['当事人'].to_list()): - serise = dfo[(dfo['适用程序']==tag1)&(dfo['当事人']==tag2)] - if len(serise) > 0: - ss = serise.iloc[0].copy() - if '_集合' not in ss['适用程序']: - print_log('>>> 发现系列案:',serise['案号'].to_list()) - sn0 = serise['案号'].to_list() - sn = [re.search(r'\d+(?=号)|$',x).group(0) for x in sn0] - if sn[0] != sn[-1]: - ss['案号'] = re.sub(r'\d+(?=号)','%s-%s'%(sn[0],sn[-1]),sn0[0]) - ss['适用程序'] = ss['适用程序']+'_集合' - dfoo = pd.concat([ dfoo[~dfoo.isin(serise).all(1)], - ss.to_frame().T]) - dfo = dfoo - dfo = copy_rows_user_func(dfj,dfo) - rename_jdocs_codes(dfo) - return dfo - -#%% df process steps - -def df_read_fix(df): - '''fix codes remove error format 处理案号格式''' - df[['立案日期','案号','主审法官','当事人']] = df[['立案日期','案号','主审法官','当事人']].replace('',float('nan')) - df.dropna(how='any',subset=['立案日期','案号','主审法官','当事人'],inplace=True) - df['原一审案号'] = df['原一审案号'].fillna('') - df[['案号','原一审案号']] = df[['案号','原一审案号']].applymap(case_codes_fix) - return df - -def df_fill_infos(dfo): - '''main fill jdocs infos''' - if len(dfo) == 0:return dfo - docs = glob(parse_subpath(jdocs_path,'*.docx')) # get jdocs - if not docs:return dfo - dfj = get_all_jdocs(docs) - global dd - dd = dfj - if len(dfj) == 0: - print_log('>>> 没有找到判决书...不处理!!');return dfo - dfn = fill_infos_func(dfj,dfo) - dfn = titles_resort(dfn,titles_main) - try: - if flag_fill_jdocs_infos: - dfo = save_adjust_xlsx(dfn,data_xlsx) - except PermissionError: - print_log('>>> %s 文件已打开...填充判决书地址失败!!...请关闭并重新执行'%data_xlsx) - return dfo - -def df_make_subset(df,oa_new=0): - ''' - cut orgin data into subset by conditions - ''' - dcn = case_codes_fix(data_case_codes) - date_range = data_date_range - last_lines = data_last_lines - if dcn: # 多个指定案号例如: (2018)哈哈1234号,(2018)哈哈3333号 - df = df[df['案号'].isin(split_list('[,,;;]',dcn)) | df['原一审案号'].isin(split_list('[,,;;]',dcn))] - elif ':' in date_range: - print_log('\n>>> 预定读取【%s】'%date_range) - df['立案日期'] = to_datetime(df['立案日期']) - df.sort_values(by=['立案日期'],inplace=True) - try: - dats = date_range.split(':') - x = parse_datetime(dats[0]);y = parse_datetime(dats[1]) - x1 = df['立案日期'].iloc[0].to_pydatetime() - y1 = df['立案日期'].iloc[-1].to_pydatetime() - t1 = min(x,y); t2 = max(x,y) - t1 = max(t1,x1);t2 = min(t2,y1) - date_start = t1 if t1 else x1 - date_end = t2 if t2 else y1 - df = df[(df['立案日期']>=date_start)&(df['立案日期']<=date_end)].copy() #这里数据分片有警告 - df['立案日期'] = df['立案日期'].astype(str) - return df - except Exception as e: - print_log('>>> 日期异常',e) - elif last_lines: - df = df.tail(last_lines) - return df - -#%% -def df_oa_append(dfo): - '''main fill OA data into df data and mark new add''' - if flag_append_oa: - if not os.path.exists(data_oa_xlsx): - print_log('>>> 没有找到OA模板 %s...不处理!!'%data_oa_xlsx);return dfo - dfoa = read_excel(data_oa_xlsx,sort=False)[titles_oa].fillna('') # only oa columns - df1 = dfo.copy() - df2 = dfoa.copy() - - if '适用程序' not in dfo.columns: - dfo['适用程序'] = 0 - dfoa = df_make_subset(dfoa,oa_new=1) # subset by columns - dfoa.rename(columns={'承办人':'主审法官'},inplace=True) - dfoa = df_read_fix(dfoa) # fix empty data columns - dfoa['add_index'] = 'new' - dfo['add_index'] = 'old' - dfors = dfo['适用程序'] - - tags = list(dfors[dfors.str.len()>2&dfors.apply(lambda x:'Done' in x)].unique()) - tags = [t.replace('_集合','') for t in tags] - - for i,df2r in dfoa.iterrows(): - if df2r['适用程序'] in tags:continue - dfo = dfo.append(df2r,sort=False) - - dfo.fillna('',inplace=True) - dfo.drop_duplicates(['立案日期','案号'],keep='first',inplace=True) - - dfo.sort_values(by=['立案日期','案号'],inplace=True) - df_noa = dfo[dfo['add_index'] == 'new'] - print_log('>>> 所有OA记录【%s条】...'%len(dfoa)) - print_log('>>> 原Data记录【%s条】...'%len(dfo)) - print_log('>>> 实际添加【%s条】新OA记录...'%len(df_noa)) - if len(df_noa): - dd = str(df_noa['立案日期'].iloc[0]) +':'+df_noa['立案日期'].iloc[-1] - print_log('>>> 实际添加【%s】'%dd) - if any(dfo['add_index'] == 'new'): - dfo = titles_resort(dfo,titles_main) - try:dfo = save_adjust_xlsx(dfo,data_xlsx) - except PermissionError:print_log('>>> %s 文件已打开...填充OA数据失败!!。。。请关闭并重新执行'%data_xlsx) - return dfo - -def df_check_format(x): - '''check data address and agent format with check flag''' - if x['aname']!='' and not re.search(r'[\/_]',x['aname']): - print_log('>>> 记录\'%s\'---- 【诉讼代理人】格式 \'%s\' 不正确,如无请留空,请自行修改...'%(x['number'],x['aname'])) - if x['address']!='' and not re.search(r'\/地址[::]',x['address']): - print_log('>>> 记录\'%s\'---- 【地址】格式 \'%s\' 不正确,如无请留空,请自行修改...'%(x['number'],x['address'])) - return x - - - -#%% df tramsfrom functions -def clean_rows_aname(x,names): - '''Clean agent name for agent to match address's agent name''' - if names: - for name in names: - if not check_cn_str(name):continue - if name in x: -# if flag_print: print('A=%s,B=%s'%(x,name)) - x = name;break - x = re.sub(r'_.*','',x) - x = re.sub(path_names_clean,'',x) - return x - -def clean_rows_adr(adr): - '''clean adr format''' - y = split_list(r'[,,]',adr) - if y: - y = list(map(lambda x: x if re.search(r'\/地址[::]',x) else adr_tag + x,y)) - adr = ','.join(list(filter(None, y))) - return adr - -def make_adr(adr,fix_aname=[]): - ''' - clean_aname:合并标识,此处如果没律师,则代理人就是自己 - fix_aname:修正名字错误 - Returns: - level_0 address clean_aname - 0 44 XX市XX镇XXX村 张三 - 1 44 XXX市XX区XXX B律师 - ''' - adr = adr[adr != ''] - adr = adr.str.strip().str.split(r'[,,。]',expand=True).stack() - adr = adr.str.strip().apply(lambda x:clean_rows_adr(x)) - adr = adr.str.strip().str.split(r'\/地址[::]',expand=True).fillna('') - adr.columns = ['aname','address'] - adr['clean_aname'] = adr['aname'].str.strip().apply(lambda x:clean_rows_aname(x,fix_aname)) # clean adr - adr = adr.reset_index().drop(['level_1','aname'],axis=1) - return adr - -def make_agent(agent,fix_aname=[]): - ''' - fix_aname:修正名字错误,假如律师(aname)有多个,则选择第一个律师作为合并标识(clean_aname),注意没有律师的合并就是自己(uname)做代理人 - Returns: - level_0 uname aname clean_aname - 0 44 张三 A律师_123213123 A律师 - 1 44 李四 - 2 44 王五 B律师_123123132123、C律师_123123 B律师 - ''' - agent = agent[agent != ''] - agent = agent.str.strip().str.split(r'[,,。]',expand=True).stack() #Series - agent = agent.str.strip().str.split(r'\/',expand=True).fillna('') #DataFrame - agent.columns = ['uname','aname'] - agent['clean_aname'] = agent['aname'].str.strip().apply(lambda x: clean_rows_aname(x,fix_aname)) - dd_l = agent['uname'].str.strip().str.split(r'、',expand=True).stack().to_frame(name = 'uname').reset_index() - dd_r = agent[agent.columns.difference(['uname'])].reset_index() - agent = merge(dd_l,dd_r,how='outer',on=['level_0','level_1']).drop(['level_1','level_2'],axis=1).fillna('') - return agent - -def merge_user(user,agent): - '''合并后以uname为主,clean_aname是律师标识 - Returns: - level_0 uname aname clean_aname - 0 44 张三 A律师_123213123 A律师 - 2 44 王五 B律师_123123132123、C律师_123123 B律师 - ''' - return merge(user,agent,how='left',on=['level_0','uname']).fillna('') - -def merge_usr_agent_adr(agent,adr): - ''' clean_aname 去除nan,保留曾用名 - ''' - agent['clean_aname'].replace('',float('nan'),inplace=True) - agent['clean_aname'] = agent['clean_aname'].fillna(agent['uname']).replace(path_names_clean,'') - adr['clean_aname'] = adr['clean_aname'].apply(lambda x: clean_rows_aname(x,agent['clean_aname'].tolist())) - tb = merge(agent,adr,how='outer',on=['level_0','clean_aname']).fillna('') - tb.dropna(how='all',subset=['uname', 'aname'],inplace=True) - return tb - -def reclean_data(tb): - tg = tb.groupby(['level_0','clean_aname','aname','address'])['uname'].apply(lambda x: '、'.join(x.astype(str))).reset_index() - glist = tg['uname'].str.split(r'、',expand=True).stack().values.tolist() - rest = tb[tb['uname'].isin(glist) == False] - x = concat([rest,tg],axis=0,sort=True) - return x - -def sort_data(x,number): - x = x[['level_0','uname','aname','address']].sort_values(by=['level_0']) - x = merge(number,x,how='right',on=['level_0']).drop(['level_0'],axis=1).fillna('') - return x - -#%% main processing stream 主数据流程 - -try: - if not os.path.exists(cfgfile): - '''生成默认配置''' - write_config() - conf_list = read_config() -except Exception as e: - print_log('>>> 配置文件出错 %s ,删除...'%e) - if os.path.exists(cfgfile): - os.remove(cfgfile) - try: - write_config() - conf_list = read_config() - except Exception as e: - '''这里可以添加配置问题预判问题''' - print_log('>>> 配置文件再次生成失败 %s ...'%e) - set_default_value(data_date_range = '') - print_log('''>>> 正在处理... 主表路径 = %s 指定案件 = %s 指定日期 = %s 指定条数 = %s '''%(os.path.abspath(data_xlsx), - conf_list.get('data_case_codes'), - conf_list.get('data_date_range'), - conf_list.get('data_last_lines'), + data_case_codes, + data_date_range, + data_last_lines, ) ) - if not os.path.exists(data_xlsx): - save_adjust_xlsx(DataFrame(columns=titles_main),data_xlsx,width=40) - print_log('>>> %s 记录文件不存在...重新生成'%(data_xlsx)) - -dfo = read_excel(data_xlsx,sort=False).fillna('') #真正读取记录位置 -dfo = df_read_fix(dfo) # fix empty data columns -dfo = df_oa_append(dfo) # append oa data - -dfo = df_fill_infos(dfo) # fill jdocs infos -dfo = df_make_subset(dfo) -df = titles_trans_columns(dfo,titles_cn) # 中译英方便后面处理 + ut.save_adjust_xlsx(pd.DataFrame(columns=titles_main),data_xlsx,width=40) + print_log('>>> %s 文件不存在...重新生成'%(data_xlsx)) +#%% +df = pd.read_excel(data_xlsx,sort=False).fillna('') #真正读取记录位置 +df = df_read_fix(df);df # fix empty data columns +df,ocodes = df_oa_append(df) # append oa data and sava # 合并前记录案号 +df = merge_group_cases(df);df # merge group and save +df = df[df['原一审案号'].isin(ocodes)];df # 合并后找回记录案号 +df = df_fill_infos(df) # 填充判决书内容 # filled and save +df = ut.titles_trans_columns(df,titles_cn);df # 中译英方便后面处理 if flag_check_postal: df.apply(lambda x:df_check_format(x), axis=1) - -print_log('\n>>> ***将要打印Data记录【---%s条----】...'%len(df)) if 0>> ***将要打印 => %s '%df['number'].to_list()) + print_log('>>> 将要打印【%s条】=> %s '%(len(df), + df['number'].to_list())) #%% df tramsfrom stream 数据转换流程 -if len(df) and flag_to_postal: +if len(df) and flag_to_postal: try: print_log('\n>>> 开始生成新数据 data_main_temp... ') '''获取 datetime|number''' @@ -820,22 +133,19 @@ def sort_data(x,number): print_log('>>> 缺失【代理人】和【地址】...正在处理...') agent_adr.index.name = 'level_0' agent_adr.reset_index(inplace=True) - df_x = merge(user,agent_adr,how='left',on=['level_0']).fillna('') + df_x = pd.merge(user,agent_adr,how='left',on=['level_0']).fillna('') df_x = sort_data(df_x,number) if len(df_x): data_tmp = os.path.splitext(data_xlsx)[0]+"_tmp.xlsx" df_save = df_x.copy() - df_save.columns = titles_trans(df_save.columns.tolist()) - try:df_save = save_adjust_xlsx(df_save,data_tmp,width=40) - except PermissionError: print_log('>>> %s 文件已打开...请手动关闭并重新执行...保存失败'%data_tmp) + df_save.columns = ut.titles_switch(df_save.columns.tolist()) + df_save = ut.save_adjust_xlsx(df_save,data_tmp,width=40) - except Exception as e: - raise e - print_log('>>> 错误 \'%s\' 生成数据失败,请检查源 \'%s\' 文件...退出...'%(e,data_xlsx));sys.exit() + input_exit('>>> 错误 \'%s\' 生成数据失败,请检查源 \'%s\' 文件...退出...'%(e,data_xlsx)) -#%% generate postal sheets 生成邮单流程 +#%% print postal sheets 打印邮单流程 def re_write_text(x): '''re-write postal sheet content from df rows''' @@ -848,6 +158,7 @@ def re_write_text(x): number_text = str(x['number']) address_text = str(x['address']) + # 以下填充均对于模板sheet.doc try: para = doc.paragraphs[9] # No.9 line is agent name text = re.sub(r'[\w()()]+',agent_text,para.text) @@ -857,8 +168,9 @@ def re_write_text(x): text = re.sub(r'代 \w+',user_text,para.text) para.clear().add_run(text) + para = doc.paragraphs[13] # No.13 line is number and address - text = re.sub(path_code_ix,number_text,para.text) + text = re.sub(ut.path_code_ix,number_text,para.text) para.clear().add_run(text) text = re.sub(r'(?<=\s)\w+市.*',address_text,para.text) para.clear().add_run(text) @@ -868,44 +180,39 @@ def re_write_text(x): sheet_file = number_text+'_'+agent_text+'_'+user_text+'_'+address_text+'.docx' sheet_file = re.sub(r'[\/\\\:\*\?\"\<\>]',' ',sheet_file) # keep rename legal - if os.path.exists(parse_subpath(postal_path,sheet_file)): - if flag_check_postal:print_log('>>> 邮单已存在!!! <= %s'%sheet_file) + if os.path.exists(ut.parse_subpath(postal_path,sheet_file)): + if ut.flag_check_postal:print_log('>>> 邮单已存在!!! <= %s'%sheet_file) return '' if not agent_text: if flag_check_postal:print_log('>>> 【代理人】暂缺!!! <= %s'%sheet_file) return '' - + if not address_text: if flag_check_postal:print_log('>>> 【地址】暂缺!!! <= %s'%sheet_file) return '' try: - doc.save(parse_subpath(postal_path,sheet_file)) + doc.save(ut.parse_subpath(postal_path,sheet_file)) print_log('>>> 已生成邮单 => %s'%sheet_file) return sheet_file except Exception as e: - print_log('>>> 生成失败!!! => %s'%e) + input_exit('>>> 生成失败!!! => %s ...任意键退出'%e) return '' + if len(df) and flag_to_postal: - print_log('\n>>> 正在输出邮单...') + print_log('\n>>> 正在输出邮单...\n') if not os.path.exists(sheet_docx): - input('>>> 没有找到邮单模板 %s...任意键退出'%sheet_docx);sys.exit() + input_exit('>>> 没有找到邮单模板 %s...任意键退出'%sheet_docx) df_p = df_x.apply(re_write_text,axis = 1) count = len(df_p[df_p != '']) codes = df_x['number'].astype(str) dates = df_x['datetime'].astype(str) codesrange = codes.iloc[0] if codes.iloc[0] == codes.iloc[-1] else ('%s:%s'%(codes.iloc[0],codes.iloc[-1])) datesrange = dates.iloc[0] if dates.iloc[0] == dates.iloc[-1] else ('%s:%s'%(dates.iloc[0],dates.iloc[-1])) - print_log('>>> 最终生成邮单【%s条】范围: 【%s】日期:【%s】'%(count,codesrange,datesrange)) - + print_log('\n>>> 最终生成邮单【%s条】范围: 【%s】日期:【%s】'%(count,codesrange,datesrange)) + del df_x,df_p,codes,dates del user,number,agent,adr,df,agent_adr,opt - -#%% main finish 结束所有 - -print_log('>>> 全部完成,可以回顾记录...任意键退出') -#input('>>> 全部完成,可以回顾记录...任意键退出');sys.exit() - - +input_exit('>>> 全部完成,可以回顾记录...任意键退出') diff --git a/getjdocs.py b/getjdocs.py new file mode 100644 index 0000000..6f07bba --- /dev/null +++ b/getjdocs.py @@ -0,0 +1,126 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Sep 11 11:44:28 2019 + +@author: autol +""" + +import os,re +from glob import glob +import pandas as pd +from docx import Document +from util import check_codes,case_codes_fix,split_list,parse_subpath +from globalvar import * + +#%% 读取判决书jdocs代码 + +def read_jdocs_table(tables): + codes = '' + for table in tables: + for row in table.rows: + for cell in row.cells: + for paragraph in cell.paragraphs: + x = paragraph.text + if re.search(path_code_ix,x) and len(x) < 25: + codes = case_codes_fix(x) + break + return codes + +def get_jdocs_infos(doc,lines = 20):# search at least 20 lines docs + '''get pre address from judgment docs, return docs pre code and address''' + adrs = {};codes = '' + try:tables = Document(doc).tables + except Exception as e: + print_log('读取错误 %s ,docx文档问题,请重新另存为,或关闭已打开的docx文档'%e) + return codes,adrs + if tables: codes = read_jdocs_table(tables) + paras = Document(doc).paragraphs + if not paras: + return codes,adrs + if len(paras) > 20: # 多于20行就扫描一半内容 + lines = int(len(paras)/2) + parass = paras[:lines] + for i,para in enumerate(parass): + x = para.text.strip() + if len(x) > 150: continue # 段落大于150字就跳过 + if re.search(path_code_ix,x) and len(x) < 25: + codes = case_codes_fix(x);continue # codes + cond3 = re.search(r'法定代表|诉讼|代理人|判决|律师|请求|证据|辩称|辩论|不服',x) # 跳过非人员信息 + cond4 = re.search(r'上市|省略|区别|借款|保证|签订',x) # 跳过非人员信息,模糊 + cond1 = re.search(r'(?<=[::]).*?(?=[,,。])',x) # 通过间隔提取 + cond2 = re.search(r'.*?[省市州县区乡镇村]',x) # 地址规则 + if cond3:continue + if cond4:continue + if cond1 and cond2: + ''' + Todo: get user and address + Usage: '被上诉人(原审被告):张三,男,1977年7月7日出生,汉族,住XX自治区(省)XX市XX区1212。现住XX省XX市XX区3434' + -> {'张三': 'XX省XX市XX区3434'} + ''' + try: + name = re.search(r'(?<=[::]).*?(?=[,,。])|$',x).group(0).strip() + name = re.sub(r'[((][下称|原名|反诉|变更前].*?[))]','',name) # filter some special names,notice here will add some words for filter + z = split_list(r'[,,::.。]',x) + z = [re.sub(r'户[籍口]|居住|身份证|所在地|住所地?|住址?|^[现原]住?','',y) for y in z if re.search(r'.*?[省市州县区乡镇村]',y)][-1] # 几个地址选最后一个 remain only address + adr = {name:''.join(z)} + adrs.update(adr) + except Exception as e: + print_log('获取信息失败 =>',e) + return codes,adrs + +def rename_jdoc_x(doc,codes): + '''rename only judgment doc files''' + jdoc_name = os.path.join(os.path.split(doc)[0],'判决书_'+codes+'.docx') + if not codes in doc:# os.path.exists(jdoc_name) + try: + os.rename(doc,jdoc_name) + return 1 + except Exception as e: + print_log(e) + os.remove(doc) + return 0 + return 0 + +def get_all_jdocs(docs): + '''主要获取的入口''' + numlist=[]; nadr = [] + for doc in docs: + codes,adrs = get_jdocs_infos(doc) + if codes: + rename_jdoc_x(doc,codes) + numlist.append(codes) + nadr.append(adrs) + if flag_check_jdocs and codes: + print_log('>>> 判决书信息 【%s】-【%s人】-%s \n'%(codes,len(adrs),adrs)) + numlist = list(map(case_codes_fix,numlist)) + return pd.DataFrame({'判决书源号':numlist,'new_adr':nadr}) + + +def rename_jdocs_codes_x(d,r,old_codes): + '''add jdoc current case codes for reference 判决书改名,包括源案号''' + if str(r[old_codes]) in str(d): + nd = os.path.join(os.path.split(d)[0],'判决书_'+str(r['案号']) +'_原_'+ str(r[old_codes]) + '.docx') + if(d == nd): # 相同则返回 + return d + try: # 不同则命名,检测源文件存在 + if os.path.exists(nd): + os.remove(nd) + os.rename(d,nd) + print('>>> 重命名判决书 => ',nd) + except Exception as e: + print_log(e) + return nd + return d + +def rename_jdocs_codes(dfo): + '''rename jdocs with new codes''' + old_codes='判决书源号' + docs = glob(parse_subpath(jdocs_path,'判决书_*.docx')) + df = dfo[dfo[old_codes] != ''] + if docs: + for doc in docs: + for (i,dfr) in df.iterrows(): + if check_codes(dfr[old_codes]) and str(dfr[old_codes]) in doc: + rename_jdocs_codes_x(doc,dfr,old_codes) + break + return None \ No newline at end of file diff --git a/globalvar.py b/globalvar.py new file mode 100644 index 0000000..0192a7c --- /dev/null +++ b/globalvar.py @@ -0,0 +1,70 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Sep 11 15:23:05 2019 + +@author: autol +""" +#%% +import os,re,sys +import pandas as pd +from configure import write_config,read_config + +#%% +pd.set_option('max_colwidth',500) +pd.set_option('max_rows', 50) +pd.set_option('max_columns',50) + +#%% global variable + +titles_main = ['立案日期','适用程序','案号','原一审案号','判决书源号','主审法官','当事人','诉讼代理人','地址',] +titles_oa = ['立案日期','案号','原一审案号','承办人','当事人','适用程序'] +titles_cn = ['立案日期','案号','当事人','诉讼代理人','地址'] +titles_en = ['datetime','number','uname','aname','address'] +path_names_clean = re.compile(r'[^A-Za-z\u4e00-\u9fa5()():]') # remain only name including old name 包括括号冒号 +search_names_phone = lambda x: re.search(r'[\w()()::]+\_\d+',x) # phone numbers +path_code_ix = re.compile(r'[((][0-9]+[))].*?号') # case numbers +postal_path = os.path.join('.','postal') +jdocs_path = os.path.join('.','jdocs') +adr_tag = '/地址:' +done_tag = '_集合' +usrtag = r'申请人|被申请人|原告|被告|原审被告|上诉人|被上诉人|第三人|原审诉讼地位|申请再审人|被申请再审人' # 当事人抬头标识 + + +#%% print_log log + +logname = 'log.txt' + +def print_log(*args, **kwargs): + print(*args, **kwargs) + with open(logname, "a",encoding='utf-8') as file: + print(*args, **kwargs, file=file) + +def input_exit(*args, **kwargs): + input(*args, **kwargs);sys.exit() + return 1 + +if os.path.exists(logname): + os.remove(logname) + +#%% read configure global variable + +def init_var(): + cfgfile = 'conf.txt' + try: + if not os.path.exists(cfgfile): write_config(cfgfile) # 生成默认配置 + var = pd.Series(read_config(cfgfile));var + except Exception as e: + print_log('>>> 配置文件出错 %s ,删除...'%e) + if os.path.exists(cfgfile): + os.remove(cfgfile) + try: + write_config() + var = pd.Series(read_config());var + except Exception as e: + '''这里可以添加配置问题预判问题''' + input_exit('>>> 配置文件再次生成失败 %s ...'%e) +# print(var) + return var + +var = init_var() +locals().update(var.to_dict()) # 设置读取的全局变量 diff --git a/rqlist.md b/rqlist.md new file mode 100644 index 0000000..106295a --- /dev/null +++ b/rqlist.md @@ -0,0 +1,5 @@ +numpy==1.16.2 +pandas==0.24.2 +python-docx==0.8.10 +StyleFrame==2.0.3 +PyInstaller==3.4 diff --git a/util.py b/util.py new file mode 100644 index 0000000..84be0e1 --- /dev/null +++ b/util.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Sep 11 11:29:47 2019 + +@author: autol +""" + +import os,re,datetime +from StyleFrame import StyleFrame, Styler +from globalvar import * + +#%% base utils + +def split_list(regex,L): + return list(filter(None,re.split(regex,L))) + +def user_to_list(u): + '''get name list from user string + Usage: '申请人:张xx, 被申请人:李xx, 原审被告:罗xx(又名罗aa)' + -> ['张xx', '李xx', '罗xx(又名罗aa)'] + ''' + u = split_list(r'[:、,,]',u) + return [x for x in u if not re.search(usrtag,x)] + +def check_codes(x): + '''check cases codes here''' + return bool(re.search(path_code_ix.pattern,str(x))) + +def case_codes_fix(x): + '''fix string with chinese codes format + Usage: 'dsfdsf(2018)中文中文248号sdfsdf' -> '(2018)中文中文248号' + ''' + x = str(x) + x = re.search(path_code_ix.pattern+r'|$',x).group().strip().replace(' ','') + x = x.replace('(','(').replace(')',')') + return x + +def expand_codes(xxx): + ''' + 对于系列案号处理,展开案号 + Usage: ['(2018)中文中文111、248号','(2018)中文中文333、444号'] + -> ['(2018)中文中文111号', '(2018)中文中文248号', '(2018)中文中文333号', '(2018)中文中文444号'] ''' + cc =[] + for xx in xxx: + aa = re.split('、',re.search(r'\d+、.*\d+(?=号)|$',xx).group(0)) + bb = [re.sub(r'\d+、.*\d+(?=号)',x,xx) for x in aa] + cc+=bb + return cc + +def parse_subpath(path,file): + '''make subpath''' + if not os.path.exists(path): + os.mkdir(path) + return os.path.join(path,file) + +def check_cn_str(x): + '''check if string contain chinese''' + return bool(re.search(r'[\u4e00-\u9fa5]',str(x))) + +def parse_datetime(date): + '''datetime transform''' + try:date = datetime.datetime.strptime(date,'%Y-%m-%d') + except ValueError:print_log('时间范围格式有误,默认选取全部日期');date = '' + return date + +def titles_switch(df_list): + '''switch titles between Chinese and English''' + titles_cn2en = dict(zip(titles_cn, titles_en)) + titles_en2cn = dict(zip(titles_en, titles_cn)) + trans_cn_en = list(map(lambda x,y:(titles_cn2en if y else titles_en2cn).get(x), + df_list,list(map(check_cn_str,df_list)))) + return trans_cn_en + +def titles_trans_columns(df,titles): + '''sub-replace columns titles you want''' + titles_rest = df.drop(titles,axis=1).columns.tolist() + df = df[titles + titles_rest] + df.columns = titles_switch(titles) + titles_rest + return df + +def titles_resort(df,titles): + '''resort titles with orders''' + titles_rest = df.drop(titles,axis=1).columns.tolist() + return df[titles + titles_rest] + +#%% read func + +isStyleFrame = 1 + +def save_adjust_xlsx(df,file,textfit=('当事人', '诉讼代理人', '地址'),width=60): + '''save and re-adjust excel format + with StyleFrame or not + ''' + try: + print_log('>>> 保存文件 => 文件名 \'%s\''%file) + df = df.reset_index(drop='index').fillna('') + if isStyleFrame: + StyleFrame.A_FACTOR = 5 + StyleFrame.P_FACTOR = 1.2 + sf = StyleFrame(df,Styler(wrap_text = False, shrink_to_fit=True, font_size= 12)) + if('add_index' in df.columns.tolist()): + sf.apply_style_by_indexes(indexes_to_style=sf[sf['add_index'] == 'new'], + styler_obj=Styler(bg_color='yellow'), + overwrite_default_style=False) + sf.apply_column_style(cols_to_style = textfit, + width = width, + styler_obj=Styler(wrap_text=False,shrink_to_fit=True)) + else: + sf.set_column_width_dict(col_width_dict={textfit: width}) + if len(df): + sf.to_excel(file,best_fit=sf.data_df.columns.difference(textfit).tolist()).save() + else: + sf.to_excel(file).save() + else: + df.to_excel(file,index=0) + except PermissionError: + print_log('!!!!!%s被占用,不能覆盖记录!!!!!'%file) + return df + +def check_time(dlist): + '''split and check configure times''' + if dlist: + if isinstance(dlist,str): + if re.search(r'[::]',dlist): + dlist = split_list(r'[::]',dlist) + else: + dlist = [dlist] + for date in dlist: + try: + datetime.datetime.strptime(date, '%Y-%m-%d') + except ValueError as e: + print("Incorrect data format, should be YYYY-MM-DD",e) + return 0,None + return 1,dlist + return 0,None