From 122b75ee3fd13808ec39d2c67920bce7652a5bb9 Mon Sep 17 00:00:00 2001
From: autolordz <autolordz@gmail.com>
Date: Fri, 20 Sep 2019 18:47:36 +0800
Subject: [PATCH] separate pyfiles and optimize group cases

---
 README.md              |  57 +--
 configure.py           |  56 +++
 copyinfos.py           | 133 +++++++
 demo_docs/conf.txt     |  13 +
 df_progress.py         | 157 ++++++++
 df_transform.py        | 106 ++++++
 docx-content-modify.py | 801 +++--------------------------------------
 getjdocs.py            | 126 +++++++
 globalvar.py           |  70 ++++
 rqlist.md              |   5 +
 util.py                | 135 +++++++
 11 files changed, 871 insertions(+), 788 deletions(-)
 create mode 100644 configure.py
 create mode 100644 copyinfos.py
 create mode 100644 demo_docs/conf.txt
 create mode 100644 df_progress.py
 create mode 100644 df_transform.py
 create mode 100644 getjdocs.py
 create mode 100644 globalvar.py
 create mode 100644 rqlist.md
 create mode 100644 util.py
diff --git a/README.md b/README.md
index 888bd01..b098bd1 100644
--- a/README.md
+++ b/README.md
@@ -3,26 +3,10 @@
 > * 法院法务自动化批量生成邮寄单据-Legal agency postal notes automatically generate app
 > * 给予法务邮递人员从法务OA数据表(excel)和公开的判决书(docx)提取当事人地址内容，批量直接生成邮单。 减轻相关员负担，尤其系列案，人员多地址多，手工输入地址重复性劳动太多，信息容易错漏
 
-
-
 [![](https://img.shields.io/github/release/autolordz/docx-content-modify.svg?style=popout&logo=github&colorB=ff69b4)](https://github.com/autolordz/docx-content-modify/releases)
 [![](https://img.shields.io/badge/github-source-orange.svg?style=popout&logo=github)](https://github.com/autolordz/docx-content-modify)
 [![](https://img.shields.io/github/license/autolordz/docx-content-modify.svg?style=popout&logo=github)](https://github.com/autolordz/docx-content-modify/blob/master/LICENSE)
 
-## 目录
-
-<!-- MarkdownTOC autoanchor="true" autolink="true" uri_encoding="false" -->
-
-- [环境](#环境)
-- [更新](#更新)
-- [内容](#内容)
-- [规则](#规则)
-- [详细指南](#详细指南)
-- [Licence](#licence)
-
-<!-- /MarkdownTOC -->
-
-<a id="环境"></a>
 ## 环境
 
 > * conda : 4.6.14
@@ -31,18 +15,17 @@
 > * 组件: python-docx,pandas,StyleFrame,configparser  
 > * 打包程序: pyinstaller 
 
-<a id="更新"></a>
 ## 更新
 
-【2019-6-19】  
-> * 添加合并系列案功能，节省打印资源
+【2019-9-19】
 
-【2019-6-12】
+> * 整理合并系列案功能，优化代码
 
-> * 更新判决书过滤词汇
+【2019-6-19】
+
+> * 添加合并系列案功能，节省打印资源
 
 
-<a id="内容"></a>
 ## 内容
 
 - [x] 按格式重命名判决书
@@ -57,7 +40,6 @@
 - [x] 按照Data表输出寄送邮单
 	- [x] 填充好所有信息，再次运行就能输出Data表指定邮单
 
-<a id="规则"></a>
 ## 规则
 
 1. 当事人收信规则，没代理律师的每个当事人一份，有委托律师的只要寄给律师一份，多个律师寄给第一个律师，同一律所也是一份 
@@ -89,21 +71,17 @@ Data表部分字段演示：
 
 4. 【适用程序】规则(系列案用):  
 
-此处在OA表的【适用程序】填写,人为判断几个案是同一系列案的请在该字段中标注len(str)>3的唯一记号,系列案会自动合并
-
-len(str)>3 = 记号多于三个字符 
+此处在OA表中当事人几个案件中完全相同就合并为一个案件,发一次邮单,假如人员稍有差别,仍然按原来分开处理  
 
 例如：  
 
 | 【适用程序】 | 【案号】 |
 | --- | --- |
-| AAA       | 2773 |
-| 2774-2776 | 2774 |
-| 2774-2776 | 2775 |
-| 2774-2776 | 2776 |
+| 2160、2161_集合 | 2160 |
+| 2160、2161_集合 | 2161 |
 
 
-5. config.txt:  
+5. conf.txt:  
 ```python
 [config]
 data_xlsx = data_main.xlsx    # 数据模板地址
@@ -114,20 +92,18 @@ flag_append_oa = 1    # 是否导入OA数据
 flag_to_postal = 1    # 是否打印邮单
 flag_check_jdocs = 0    # 是否检查用户格式,输出提示信息
 flag_check_postal = 0    # 是否检查邮单格式,输出提示信息
-flag_output_log = 1    # 是否保存打印
-data_case_codes =    # 指定打印案号,可接多个,示例:AAA,BBB,优先级1
+data_case_codes =    # 指定打印案号,可接多个,示例:AAA号,BBB号,优先级1
 data_date_range =   # 指定打印数据日期范围示例:2018-09-01:2018-12-01,优先级2
-data_last_lines = 10    # 指定打印最后行数,优先级3
+data_last_lines = 3    # 指定打印最后行数,优先级3
 ```
 
-<a id="详细指南"></a>
 ## 详细指南
 
 简称：  
-- [A表: data_oa.xlsx,OA表自己下载,这个只是参考](./demo_docs/data_oa.xlsx)
-- [B表: data_main.xlsx,会自动生成,也要修改](./demo_docs/data_main.xlsx)
-- [C目录: jdocs/,判决书目录,要放下载的判决书](./demo_docs/jdocs/)
-- [D文档: sheet.docx,邮单模板,按照背景生成邮单](./demo_docs/sheet.docx)
+- [A表: data_oa.xlsx,OA表自己下载,这个只是参考](./demo_docs/data_oa.xlsx)  
+- [B表: data_main.xlsx,会自动生成,也要修改](./demo_docs/data_main.xlsx)  
+- [C目录: jdocs/,判决书目录,要放下载的判决书](./demo_docs/jdocs/)  
+- [D文档: sheet.docx,邮单模板,按照背景生成邮单](./demo_docs/sheet.docx)  
 - [E目录: postal/,邮单目录](./demo_docs/postal/)  
 
 1. 根据 **A表** 格式,整理自己的OA表(没数据是没用的),先在OA表中修改【适用程序】(系列案),修改conf.txt文件,参考[规则](#规则),如文件丢失再次运行会生成  
@@ -147,9 +123,8 @@ data_last_lines = 10    # 指定打印最后行数,优先级3
 5. 第二次运行(带【诉讼代理人】)  
 会重复 3.4.  3.5. 3.6.  
 
-6. 小白没有python环境，可以直接下载最新的exe版本，[win7/win10(32/64))](https://github.com/autolordz/docx-content-modify/releases/download/1.0.1/exe-win7win10-8962f68c.zip)，仍然需要设置config文件
+6. 小白没有python环境，可以直接下载最新的exe版本，使用前先配置conf.txt文件
 
-<a id="licence"></a>
 ## Licence
 
 [See Licence](https://github.com/autolordz/docx-content-modify/blob/master/LICENSE)
diff --git a/configure.py b/configure.py
new file mode 100644
index 0000000..eaa427e
--- /dev/null
+++ b/configure.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 11 11:41:46 2019
+
+@author: autol
+"""
+
+import configparser
+
+#%% config and default values
+
+def write_config(cfgfile):
+    '''Write the default [config] section to cfgfile (utf-8-sig) and return that section.'''
+    cfg = configparser.ConfigParser(allow_no_value=1,
+                                    inline_comment_prefixes=('#', ';'))
+    cfg['config'] = dict(
+            data_xlsx = 'data_main.xlsx    # 数据模板地址',
+            data_oa_xlsx = 'data_oa.xlsx    # OA数据地址',
+            sheet_docx = 'sheet.docx    # 邮单模板地址',
+            flag_fill_jdocs_infos = '1    # 是否填充判决书地址',
+            flag_append_oa = '1    # 是否导入OA数据',
+            flag_to_postal = '1    # 是否打印邮单',
+            flag_check_jdocs = '0    # 是否检查用户格式,输出提示信息',
+            flag_check_postal = '0    # 是否检查邮单格式,输出提示信息',
+            data_case_codes = '   # 指定打印案号,可接多个,示例:AAA,BBB,优先级1',
+            data_date_range = '  # 指定打印数据日期范围示例:2018-09-01:2018-12-01,优先级2',
+            data_last_lines = '3    # 指定打印最后行数,优先级3',
+        )
+    # Each value above is one string: the default followed by a "# ..." hint that ends up in the written file.
+    with open(cfgfile, 'w',encoding='utf-8-sig') as configfile:
+        cfg.write(configfile)
+    print('>>> 重新生成配置 %s ...'%cfgfile)
+    return cfg['config'] # NOTE(review): in-memory values are the strings assigned above, hint text included
+
+
+#%%
+def read_config(cfgfile):
+    '''Read the [config] section of cfgfile (utf-8-sig) into a plain dict; the five flag_* options go through int().'''
+    cfg = configparser.ConfigParser(allow_no_value=True,
+                                    inline_comment_prefixes=('#', ';'))
+    cfg.read(cfgfile,encoding='utf-8-sig')
+    ret = dict(
+            data_xlsx = cfg['config']['data_xlsx'],
+            data_oa_xlsx = cfg['config']['data_oa_xlsx'],
+            sheet_docx = cfg['config']['sheet_docx'],
+            data_case_codes = cfg['config']['data_case_codes'],
+            data_date_range = cfg['config']['data_date_range'],
+            data_last_lines = cfg['config']['data_last_lines'], # returned as read, not converted to int here
+            flag_fill_jdocs_infos = int(cfg['config']['flag_fill_jdocs_infos']),
+            flag_append_oa = int(cfg['config']['flag_append_oa']),
+            flag_to_postal = int(cfg['config']['flag_to_postal']),
+            flag_check_jdocs = int(cfg['config']['flag_check_jdocs']),
+            flag_check_postal = int(cfg['config']['flag_check_postal']),
+        )
+    return ret
+
diff --git a/copyinfos.py b/copyinfos.py
new file mode 100644
index 0000000..1886645
--- /dev/null
+++ b/copyinfos.py
@@ -0,0 +1,133 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 11 12:08:08 2019
+
+@author: autol
+"""
+
+
+#%%
+import re
+from collections import Counter
+from util import split_list,user_to_list,save_adjust_xlsx
+from globalvar import *
+
+#%%
+
+def copy_users_compare(jrow,df,errs=tuple('    ')):
+    '''Compare a case row's parties with the names found in one judgment.
+    Returns df['当事人'] when the judgment is accepted, else one of errs
+    (order: no OA user record, suspected typo, duplicated field, series case).
+    no shared name, or an empty side -> errs[0];
+    a name repeated on either side -> errs[2], the duplicates are logged;
+    partial overlap: leftover names sharing >=60% of characters -> errs[1],
+    >=20% -> accepted, below that the coverage ratios decide:
+    judgment has extra people -> errs[3] unless jcode equals 原一审案号;
+    case has extra people -> accepted; nothing matches -> errs[0].'''
+
+    code0 = str(df['案号']).strip()
+    code1 = str(df['原一审案号']).strip()
+    jcode = str(jrow['判决书源号']).strip()
+    x = Counter(user_to_list(df['当事人'])) # names of the current case
+    y = Counter(list(jrow['new_adr'].keys())) # names found in the judgment
+    n_common = len(list((x&y).elements()))
+    if not n_common: # disjoint, or one side empty: bail out before any division or max()
+        return errs[0]
+    rxy = n_common/len(list((x|y).elements()))
+    rxyx = n_common/len(list(x.elements()))
+    rxyy = n_common/len(list(y.elements()))
+    # from here on x and y are both non-empty and 0 < rxy <= 1
+    if max(x.values()) > 1 or max(y.values()) > 1: # a name occurs more than once
+        xdu = [k for k,v in x.items() if v > 1] # the repeated names
+        ydu = [k for k,v in y.items() if v > 1]
+        print_log('>>> %s 用户有字段重复【%s】-【案件:%s】 vs 【判决书:%s】'
+                  %(code0,'{0:.0%}'.format(rxy),xdu,ydu))
+        return errs[2]
+    if rxy == 1: # exact match
+        return df['当事人']
+    if 0 < rxy < 1: # partial overlap: look for typos
+        dx = list((x-y).elements())
+        dy = list((y-x).elements())
+        xx = Counter(''.join(dx))
+        yy = Counter(''.join(dy))
+        rxxyy = len(set(xx)&set(yy))/len(set(xx)|set(yy))
+        # rxxyy: share of distinct characters common to the unmatched names of both sides
+        if rxxyy >= .6:
+            print_log('>>> %s 认为【错别字率 %s】->【案件:%s vs 判决书:%s】'
+                      %(code0,'{0:.0%}'.format(1-rxxyy),dx,dy))
+            return errs[1]
+        elif rxxyy >= .2:
+            print_log('>>> %s 认为【不好判断当正常处理【差异率 %s】vs【相同范围:%s】->【差异范围:案件:%s vs 判决书:%s】 '
+                          %(code0,'{0:.0%}'.format(1-rxxyy),
+                            list((x&y).elements()),
+                            dx,dy))
+            return df['当事人']
+    if rxyx > .8:
+        print_log('>>> %s 案件 %s人 < 判决书  %s人'%(code0,len(x),len(y)))
+        if jcode != code1: # series case
+            print_log('>>> %s 认为【系列案,判决书人员 %s 多出地址】'%(code0,list((y-x).elements())))
+            return errs[3]
+        else:
+            return df['当事人']
+    elif rxyy > .8:
+        print_log('>>> %s 案件 %s人 > 判决书 %s人'%(code0,len(x),len(y)))
+        print_log('>>> %s 认为【当前案件人员 %s 缺地址】'%(code0,list((x-y).elements())))
+        return df['当事人']
+    return errs[0]
+
+
+def copy_rows_adr1(x,n_adr):
+    ''' Merge addresses taken from a judgment (n_adr: name -> address) into the row's 地址 string.
+        Row fields read: '当事人','诉讼代理人','地址','案号'.
+        Parties that already have an agent get no address of their own; returns the new 地址 string.
+    '''
+    user = x['当事人'];agent = x['诉讼代理人'];adr = x['地址']; codes = x['案号']
+    if not isinstance(n_adr,dict):
+        return adr
+    else:
+        y = split_list(r'[,，]',adr)
+        adr1 = y.copy()
+        for i,k in enumerate(n_adr):
+            by_agent = any([k in ag for ag in re.findall(r'[\w+、]*\/[\w+]*',agent)]) # k occurs in an agent entry shaped like 'XX、XX/XX_123123'
+            if by_agent and k in adr: # the party has an agent: drop the party's own address entries
+                y = list(filter(lambda x:not k in x,y))
+            if type(n_adr) == dict and not k in adr and k in user and not by_agent:
+                y += [k+adr_tag+n_adr.get(k)] # append as name + adr_tag + address
+        adr2 = y.copy()
+        adr =  '，'.join(list(filter(None, y)))
+        if Counter(adr1) != Counter(adr2) and adr and flag_check_jdocs:
+            print_log('>>> 【%s】成功复制判决书地址=>【%s】'%(codes,adr))
+    return adr
+
+address_tmp_xlsx = 'address_tmp.xlsx'
+
+def copy_rows_user_func(dfj,dfo):
+    '''For each row of dfo, find the first usable judgment row in dfj and copy its addresses into dfo.
+    dfo is modified in place (adds column 判决书源号) and returned; dfj is saved to address_tmp_xlsx.'''
+    errs = ['【OA无用户记录】','【用户错别字】','【字段重复】','【系列案】']
+    # errs: markers copy_users_compare returns when it rejects a judgment
+    dfo['判决书源号'] = ''
+    # find_source reads i, dfor, n_adr, code0, code1 and jcode from the loops below
+    def find_source():
+        print_log('\n>>> 判决书信息 | 案号=%s | 源号=%s | 判决书源号=%s'%(code0,code1,jcode))
+        dfo.loc[i,'地址'] = copy_rows_adr1(dfor,n_adr)
+        dfo.loc[i,'判决书源号'] = jcode
+
+    for (i,dfor) in dfo.iterrows():
+        for (j,dfjr) in dfj.iterrows():
+            code0 = str(dfor['案号']).strip()
+            code1 = str(dfor['原一审案号']).strip()
+            jcode = str(dfjr['判决书源号']).strip()
+            n_adr = dfjr['new_adr']
+            if isinstance(n_adr,dict):
+                if not n_adr:continue # no addresses were extracted from this judgment
+                if code1 == jcode: # same code: this judgment belongs to the row
+                    find_source() ; break
+                else: # codes differ: fall back to comparing the party names
+                    tag1 = copy_users_compare(dfjr,dfor,errs)
+                    if tag1 not in errs:
+                        find_source() ; break
+                    else: pass
+    dfj = dfj.fillna('')
+    save_adjust_xlsx(dfj,address_tmp_xlsx,textfit=('判决书源号','new_adr')) # keep the extracted info in a temporary workbook
+    return dfo
\ No newline at end of file
diff --git a/demo_docs/conf.txt b/demo_docs/conf.txt
new file mode 100644
index 0000000..c6c5668
--- /dev/null
+++ b/demo_docs/conf.txt
@@ -0,0 +1,13 @@
+﻿[config]
+data_xlsx = data_main.xlsx    # 数据模板地址
+data_oa_xlsx = data_oa.xlsx    # OA数据地址
+sheet_docx = sheet.docx    # 邮单模板地址
+flag_fill_jdocs_infos = 1    # 是否填充判决书地址
+flag_append_oa = 1    # 是否导入OA数据
+flag_to_postal = 1    # 是否打印邮单
+flag_check_jdocs = 0    # 是否检查用户格式,输出提示信息
+flag_check_postal = 0    # 是否检查邮单格式,输出提示信息
+data_case_codes =    # 指定打印案号,可接多个,示例:AAA,BBB,优先级1
+data_date_range =   # 指定打印数据日期范围示例:2018-09-01:2018-12-01,优先级2
+data_last_lines = 3    # 指定打印最后行数,优先级3
+
diff --git a/df_progress.py b/df_progress.py
new file mode 100644
index 0000000..48edba7
--- /dev/null
+++ b/df_progress.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 11 15:11:39 2019
+
+@author: autol
+"""
+
+import os,re
+from glob import glob
+import pandas as pd
+import util as ut
+from copyinfos import copy_rows_user_func
+from getjdocs import get_all_jdocs,rename_jdocs_codes
+from globalvar import *
+
+#%%
+
+def df_oa_append(dfo):
+    '''Append OA rows whose 案号 is not yet in dfo, save the result and return (dfo, list).
+    The list holds 原一审案号 of the OA rows left after df_make_subset/df_read_fix; it is empty when nothing was imported.'''
+    if not flag_append_oa: return dfo,[] # import switched off: nothing to read, compare or save
+    if not os.path.exists(data_oa_xlsx):
+        print_log('>>> 没有找到OA模板 %s...不处理！！'%data_oa_xlsx);return dfo,[]
+    dfoa = pd.read_excel(data_oa_xlsx)[titles_oa].fillna('') # only oa columns; read_excel has no `sort` option
+    df0 = dfo.copy() # snapshot for the changed/unchanged check in save_df
+
+    if '适用程序' not in dfo.columns: dfo['适用程序'] = '' # column used to tag series cases
+    print('>>> OA数据共%s条'%len(dfoa))
+    dfoa = df_make_subset(dfoa) # subset by columns
+
+    dfoa.rename(columns={'承办人':'主审法官'},inplace=True)
+    dfoa = df_read_fix(dfoa) # fix empty data columns
+    dfoa['add_index'] = 'new' ;  dfo['add_index'] = 'old'
+
+    ec = list(set(ut.expand_codes(dfo['案号'].to_list()))) # case codes already in dfo, passed through ut.expand_codes
+    add =  dfoa[~dfoa['案号'].isin(ec)] # OA rows not present yet
+    dfo = pd.concat([dfo,add],sort=1).fillna('')
+    dfo.sort_values(by=['立案日期','案号'],inplace=True)
+    # rows keep their 'old'/'new' add_index mark so the two groups can be counted below
+    df_t0 = dfo[dfo['add_index'] == 'old'] ; df_t1 = dfo[dfo['add_index'] == 'new']
+    df_t1l = df_t1['立案日期'].to_list()
+    print_log('>>> 截取OA【%s条】-Data记录 old【%s条】-new【%s条】...'%(len(dfoa),len(df_t0),len(df_t1)))
+    if df_t1l:
+        print_log('>>> 实际添加【%s条】【%s】条共【%s】...'%(len(df_t1),
+                                               str(df_t1l[0])+':'+str(df_t1l[-1]),
+                                               len(dfo)))
+    save_df(df0,dfo)
+    return dfo,dfoa['原一审案号'].to_list()
+
+def merge_group_cases(dfo):
+    '''Collapse rows sharing the same 当事人 (or, when that is empty, the same 适用程序) into one row; save and return the result.'''
+    dfn = dfo.copy() # merged result; dfo is kept as-is for the comparison in save_df
+    ds = dfo[['适用程序','当事人']].drop_duplicates().copy() # distinct (适用程序, 当事人) pairs are the candidate series
+    for tag1,tag2 in zip(ds['适用程序'].to_list(),ds['当事人'].to_list()):
+        if tag2:
+            dgroup = dfo[dfo['当事人']==tag2] # look up by 当事人 first
+        elif tag1:
+            dgroup = dfo[dfo['适用程序']==tag1] # fall back to 适用程序
+        else: continue # both keys empty: dgroup would be unbound or left over from the previous pair
+        if len(dgroup) > 1:
+            ss = dgroup.iloc[0].copy() # the first row represents the group
+            if done_tag not in ss['适用程序']: # group not merged yet
+                sn0 = dgroup['案号'].to_list()
+                sn = [re.search(r'\d+(?=号)|$',x).group(0) for x in sn0] # digits right before '号', '' when absent
+                if sn[0] != sn[-1]:
+                    ss['案号'] = re.sub(r'\d+(?=号)','、'.join(sn),sn0[0])
+                    print_log('>>> 发现并合并系列案：', ss['案号'])
+                    if not ss['适用程序']: ss['适用程序'] = '、'.join(sn)
+                    ss['适用程序'] += done_tag
+                dfn = pd.concat([dfn[~dfn.isin(dgroup).all(1)], ss.to_frame().T]) # drop the group's rows, add the merged one
+    save_df(dfo,dfn)
+    return dfn
+
+#%% df process steps
+
+def fill_infos_func(dfj,dfo):
+    '''Fill dfo from the judgment rows dfj via copy_rows_user_func, pass the result to rename_jdocs_codes and return it.'''
+    dfn = copy_rows_user_func(dfj,dfo)
+    rename_jdocs_codes(dfn)
+    return dfn
+
+
+def df_read_fix(df):
+    '''Drop rows lacking any of 立案日期/案号/主审法官/当事人, then pass 案号 and 原一审案号 through ut.case_codes_fix; df is changed in place and returned.'''
+    df[['立案日期','案号','主审法官','当事人']] = df[['立案日期','案号','主审法官','当事人']].replace('',float('nan')) # '' -> NaN so dropna sees it
+    df.dropna(how='any',subset=['立案日期','案号','主审法官','当事人'],inplace=True)
+    df['原一审案号'] = df['原一审案号'].fillna('')
+    df[['案号','原一审案号']] = df[['案号','原一审案号']].applymap(ut.case_codes_fix)
+    return df
+
+def df_fill_infos(dfo):
+    '''Fill a copy of dfo with data read from the *.docx files under jdocs_path; returns dfo unchanged when there is nothing to read.'''
+    if len(dfo) == 0:  return dfo
+    docs = glob(ut.parse_subpath(jdocs_path,'*.docx')) # get jdocs
+    if not docs: return dfo
+    dfj = get_all_jdocs(docs)
+    if len(dfj) == 0: print_log('>>> 没有找到判决书...不处理！！') ; return dfo
+    dfn = dfo.copy()
+    dfn = fill_infos_func(dfj,dfn)
+    if flag_fill_jdocs_infos: # the flag only gates saving; the filled frame is returned either way
+        save_df(dfo,dfn)
+    return dfn
+
+def save_df(df_old,df_new): # nothing is written when both frames are equal
+    '''Compare the two frames (columns ordered by titles_main) and save df_new to data_xlsx when they differ; returns 1 if saved, else 0.'''
+    try:
+        df_old = ut.titles_resort(df_old,titles_main)
+        df_new = ut.titles_resort(df_new,titles_main)
+        pd.testing.assert_frame_equal(df_old,df_new)
+        print_log('\n>>> 内容没变,不用保存 ..\n')
+        return 0
+    except Exception: # frames differ, or anything in the try block raised: save df_new
+        ut.save_adjust_xlsx(df_new,data_xlsx)
+        return 1
+
+def df_make_subset(df):
+
+    '''
+    Cut df down to a subset; the first configured condition wins: case codes, then date range, then last N rows.
+    d_codes: one or more case codes, e.g. （2018）哈哈1234号,（2018）哈哈3333号
+    d_range: date range, e.g. 2019-08-13：2019-08-27
+    '''
+    d_codes,d_range,d_lines = data_case_codes,data_date_range,0
+    if data_last_lines:  d_lines = int(data_last_lines)
+
+    ct,dats = ut.check_time(d_range);dats
+    # priority 1: explicit case codes, matched against 案号 or 原一审案号
+    if d_codes:
+        dcc = ut.split_list(r'[,，;；]',d_codes)
+        dcc = list(filter(None,[ut.case_codes_fix(x) for x in dcc]))
+        df = df[df['案号'].isin(dcc) | df['原一审案号'].isin(dcc)]
+    # priority 2: date range; ct and dats come from ut.check_time (presumably a validity flag and the dates - verify in util)
+    elif ct:
+        print_log('\n>>> 预定读取【%s】'%d_range)
+        df['立案日期'] = pd.to_datetime(df['立案日期'])
+        df.sort_values(by=['立案日期'],inplace=True)
+        try:
+            x = dats[0]
+            if len(dats) == 1:
+                y = dats[0] # a single date is used as both ends of the range
+            else:
+                y = dats[1]
+            x,y = ut.parse_datetime(x),ut.parse_datetime(y)
+            x1 = df['立案日期'].iloc[0].to_pydatetime()
+            y1 = df['立案日期'].iloc[-1].to_pydatetime()
+            t1 = min(x,y); t2 = max(x,y)
+            t1 = max(t1,x1);t2 = min(t2,y1)
+            date_start = t1 if t1 else x1
+            date_end = t2 if t2 else y1
+            df = df[(df['立案日期']>=date_start)&(df['立案日期']<=date_end)].copy() # slicing here used to raise a pandas warning
+            df['立案日期'] = df['立案日期'].astype(str)
+        except Exception as e:
+            print_log('>>> 日期异常',e)
+    elif d_lines:
+        df = df.tail(d_lines)
+    return df
+
diff --git a/df_transform.py b/df_transform.py
new file mode 100644
index 0000000..b565d75
--- /dev/null
+++ b/df_transform.py
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 11 17:26:56 2019
+
+@author: autol
+"""
+import re
+import pandas as pd
+import util as ut
+from globalvar import *
+
+#%% df transform functions
+def clean_rows_aname(x,names):
+    '''Normalise an agent name: snap x to the first entry of names it contains, cut it at the first '_' and strip what path_names_clean matches.'''
+    if names:
+        for name in names:
+            if not ut.check_cn_str(name):continue # skip entries that fail ut.check_cn_str
+            if name in x:
+                x = name;break
+    x = re.sub(r'_.*','',x)
+    x = re.sub(path_names_clean,'',x)
+    return x
+
+def clean_rows_adr(adr):
+    '''Prefix every comma-separated entry of adr that lacks a "/地址:" marker with adr_tag and re-join the entries with '，'.'''
+    y = ut.split_list(r'[,，]',adr)
+    if y:
+        y = list(map(lambda x: x if re.search(r'\/地址[:：]',x) else adr_tag + x,y))
+        adr = '，'.join(list(filter(None, y)))
+    return adr
+
+def make_adr(adr,fix_aname=None):
+    '''
+    Split the address strings of Series adr into one row per entry; clean_aname is the merge key (the party's own name when there is no lawyer).
+    fix_aname: optional names used by clean_rows_aname to correct misspelt names (default None: no correction)
+    Returns:
+          level_0       address        clean_aname
+    0       44      XX市XX镇XXX村          张三
+    1       44      XXX市XX区XXX          B律师
+    '''
+    adr = adr[adr != '']
+    adr = adr.str.strip().str.split(r'[,，。]',expand=True).stack()
+    adr = adr.str.strip().apply(lambda x:clean_rows_adr(x))
+    adr = adr.str.strip().str.split(r'\/地址[:：]',expand=True).fillna('')
+    adr.columns = ['aname','address']
+    adr['clean_aname'] = adr['aname'].str.strip().apply(lambda x:clean_rows_aname(x,fix_aname)) # clean adr
+    adr = adr.reset_index().drop(['level_1','aname'],axis=1)
+    return adr
+
+def make_agent(agent,fix_aname=None):
+    '''
+    Split the agent strings of Series agent into one row per party. fix_aname (default None) is handed to clean_rows_aname to correct names; clean_aname is the merge key derived from aname and stays '' for parties without a lawyer.
+    Returns:
+       level_0       uname            aname              clean_aname
+    0       44         张三          A律师_123213123                A律师
+    1       44         李四
+    2       44         王五       B律师_123123132123、C律师_123123   B律师
+    '''
+    agent = agent[agent != '']
+    agent = agent.str.strip().str.split(r'[,，。]',expand=True).stack() #Series
+    agent = agent.str.strip().str.split(r'\/',expand=True).fillna('') #DataFrame
+    agent.columns = ['uname','aname']
+    agent['clean_aname'] = agent['aname'].str.strip().apply(lambda x: clean_rows_aname(x,fix_aname))
+    dd_l = agent['uname'].str.strip().str.split(r'、',expand=True).stack().to_frame(name = 'uname').reset_index()
+    dd_r = agent[agent.columns.difference(['uname'])].reset_index()
+    agent = pd.merge(dd_l,dd_r,how='outer',on=['level_0','level_1']).drop(['level_1','level_2'],axis=1).fillna('')
+    return agent
+
+def merge_user(user,agent):
+    '''Left-merge agent onto user by (level_0, uname), so user rows are kept; clean_aname is the lawyer key.
+    Returns:
+       level_0       uname            aname              clean_aname
+    0       44         张三          A律师_123213123                A律师
+    2       44         王五       B律师_123123132123、C律师_123123   B律师
+    '''
+    return pd.merge(user,agent,how='left',on=['level_0','uname']).fillna('')
+
+def merge_usr_agent_adr(agent,adr):
+    '''Outer-merge agent and adr on (level_0, clean_aname); an empty clean_aname falls back to uname first. Both inputs are modified in place.'''
+    agent['clean_aname'] = agent['clean_aname'].replace('',float('nan')) # assign back: an inplace call on the selection is lost under pandas copy-on-write
+    agent['clean_aname'] = agent['clean_aname'].fillna(agent['uname']).replace(path_names_clean,'')
+    names = agent['clean_aname'].tolist() # built once instead of once per address row
+    adr['clean_aname'] = adr['clean_aname'].apply(lambda x: clean_rows_aname(x,names))
+    tb = pd.merge(agent,adr,how='outer',on=['level_0','clean_aname']).fillna('')
+    tb.dropna(how='all',subset=['uname', 'aname'],inplace=True) # NOTE(review): runs after fillna(''), so no NaN is left to drop
+    return tb
+
+def reclean_data(tb): # join the unames of rows sharing (level_0, clean_aname, aname, address) into one row
+    tg = tb.groupby(['level_0','clean_aname','aname','address'])['uname'].apply(lambda x: '、'.join(x.astype(str))).reset_index()
+    glist = tg['uname'].str.split(r'、',expand=True).stack().values.tolist() # every uname that ended up in a group
+    rest = tb[tb['uname'].isin(glist) == False] # rows whose uname is in no group
+    x = pd.concat([rest,tg],axis=0,sort=True)
+    return x
+
+def sort_data(x,number): # keep four columns, order by level_0, then swap level_0 for the columns of number
+    x = x[['level_0','uname','aname','address']].sort_values(by=['level_0'])
+    x = pd.merge(number,x,how='right',on=['level_0']).drop(['level_0'],axis=1).fillna('') # right merge keeps every row of x
+    return x
+
+def df_check_format(x):
+    '''Log a hint when a non-empty aname has no '/' or '_', or a non-empty address has no "/地址:" marker; returns x unchanged.'''
+    if x['aname']!='' and not re.search(r'[\/_]',x['aname']):
+        ut.print_log('>>> 记录\'%s\'---- 【诉讼代理人】格式 \'%s\' 不正确,如无请留空,请自行修改...'%(x['number'],x['aname']))
+    if x['address']!='' and not re.search(r'\/地址[:：]',x['address']):
+        ut.print_log('>>> 记录\'%s\'---- 【地址】格式 \'%s\' 不正确,如无请留空,请自行修改...'%(x['number'],x['address']))
+    return x
\ No newline at end of file
diff --git a/docx-content-modify.py b/docx-content-modify.py
index ee6c4f9..76d2729 100644
--- a/docx-content-modify.py
+++ b/docx-content-modify.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright (c) 2018 Autoz https://github.com/autolordz
 
 # Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -18,762 +19,74 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 11 17:33:29 2019
+
+@author: autol
+"""
 
 #%%
-import os,re,sys,datetime,configparser,shutil
+import os,re
 import pandas as pd
-from pandas import DataFrame, read_excel, merge, concat, set_option, to_datetime
-isStyleFrame = 1
-from StyleFrame import StyleFrame, Styler
-from collections import Counter
 from docx import Document
-from glob import glob
-set_option('max_colwidth',500)
-set_option('max_rows', 50)
-set_option('max_columns',50)
-
-flag_print = 0
-flag_output_log = 1
-
-cfgfile = 'conf.txt'
-logname = 'log.txt'
-data_xlsx = 'data_main.xlsx'
-data_oa_xlsx = 'data_oa.xlsx'
-sheet_docx = 'sheet.docx'
-address_tmp_xlsx = 'address_tmp.xlsx'
-postal_path = os.path.join('.','postal')
-jdocs_path = os.path.join('.','jdocs')
-flag_fill_jdocs_infos = 1
-flag_append_oa = 1
-flag_to_postal = 1
-flag_check_jdocs = 0
-flag_check_postal = 0
-data_case_codes = 'AAA,BBB'
-data_date_range = '2018-09-01:2018-12-01'
-data_last_lines = 10
-conf_list = 0
-
-#%% print_log log
-
-if os.path.exists(logname):
-    os.remove(logname)
-def print_log(*args, **kwargs):
-    print(*args, **kwargs)
-    if flag_output_log:
-        with open(logname, "a",encoding='utf-8') as file:
-            print(*args, **kwargs, file=file)
-    else:
-        if os.path.exists(logname):
-            os.remove(logname)
+#import sys
+#sys.stderr = open(os.devnull, "w")  # silence stderr
+from globalvar import *
+
+#%%
+import util as ut
+from df_progress import df_fill_infos,df_oa_append,df_read_fix,merge_group_cases
+from df_transform import df_check_format
+from df_transform import make_adr,make_agent,merge_user,merge_usr_agent_adr
+from df_transform import reclean_data,sort_data
 
 #%%
-print_log('''
+print('''
 Postal Notes Automatically Generate App
 
-Updated on Thu Jun 19 2019
+Updated on Thu Sept 19 2019
 
 Depends on: python-docx,pandas,StyleFrame,configparser
 
 @author: Autoz (autolordz@gmail.com)
 ''')
-#%% config and default values
-
-
-def set_default_value(**kwargs):
-    global data_date_range
-    data_date_range = kwargs.get('data_date_range') if kwargs.get('data_date_range') != None else '# 2018-01-01:2018-12-01'
-    
-def write_config():
-    cfg = configparser.ConfigParser(allow_no_value=True,
-                                    inline_comment_prefixes=('#', ';'))
-    cfg['config'] = {'data_xlsx': data_xlsx+'    # 数据模板地址',
-                     'data_oa_xlsx': data_oa_xlsx+'    # OA数据地址',
-                     'sheet_docx': sheet_docx+'    # 邮单模板地址',
-                     'flag_fill_jdocs_infos': str(int(flag_fill_jdocs_infos))+'    # 是否填充判决书地址',
-                     'flag_append_oa': str(int(flag_append_oa))+'    # 是否导入OA数据',
-                     'flag_to_postal': str(int(flag_to_postal))+'    # 是否打印邮单',
-                     'flag_check_jdocs': str(int(flag_check_jdocs))+'    # 是否检查用户格式,输出提示信息',
-                     'flag_check_postal': str(int(flag_check_postal))+'    # 是否检查邮单格式,输出提示信息',
-                     'flag_output_log': str(flag_output_log)+'    # 是否保存打印',
-                     'data_case_codes': '   # 指定打印案号,可接多个,示例:AAA,BBB,优先级1',
-                     'data_date_range': '  # 指定打印数据日期范围示例:%s,优先级2'%(data_date_range),
-                     'data_last_lines': str(data_last_lines)+'    # 指定打印最后行数,优先级3',
-                     }
-    with open(cfgfile, 'w',encoding='utf-8-sig') as configfile:
-        cfg.write(configfile)
-    print_log('>>> 重新生成配置 %s ...'%cfgfile)
-
-def read_config():
-    global data_xlsx,data_oa_xlsx,sheet_docx,address_tmp_xlsx,postal_path
-    global jdocs_path,data_last_lines,data_date_range,data_case_codes
-    global flag_fill_jdocs_infos,flag_append_oa
-    global flag_to_postal,flag_check_jdocs,flag_check_jdocs,flag_check_postal,flag_output_log
-    cfg = configparser.ConfigParser(allow_no_value=True,
-                                    inline_comment_prefixes=('#', ';'))
-    cfg.read(cfgfile,encoding='utf-8-sig')
-    data_xlsx = cfg['config']['data_xlsx']
-    data_oa_xlsx = cfg['config']['data_oa_xlsx']
-    sheet_docx = cfg['config']['sheet_docx']
-    data_case_codes = cfg.get('config', 'data_case_codes',fallback=data_case_codes)
-    data_date_range = cfg.get('config', 'data_date_range',fallback=data_date_range)
-    data_last_lines = int(cfg.get('config','data_last_lines',fallback=data_last_lines))
-    flag_fill_jdocs_infos = int(cfg.get('config', 'flag_fill_jdocs_infos',fallback=flag_fill_jdocs_infos))
-    flag_append_oa = int(cfg.get('config', 'flag_append_oa',fallback=flag_append_oa))
-    flag_to_postal = int(cfg.get('config', 'flag_to_postal',fallback=flag_to_postal))
-    flag_check_jdocs = int(cfg.get('config', 'flag_check_jdocs',fallback=flag_check_jdocs))
-    flag_check_postal = int(cfg.get('config', 'flag_check_postal',fallback=flag_check_postal))
-    flag_output_log = int(cfg.get('config', 'flag_output_log',fallback=flag_output_log))
-    return dict(cfg.items('config'))
-#%% global variable
-
-titles_cn = ['立案日期','案号','当事人','诉讼代理人','地址']
-titles_en = ['datetime','number','uname','aname','address']
-titles_oa = ['立案日期','案号','原一审案号','承办人','当事人','适用程序']
-titles_main = ['立案日期','适用程序','案号','原一审案号','判决书源号','主审法官','当事人','诉讼代理人','地址',]
-
-path_names_clean = re.compile(r'[^A-Za-z\u4e00-\u9fa5（）()：]') # remain only name including old name 包括括号冒号
-search_names_phone = lambda x: re.search(r'[\w（）()：:]+\_\d+',x)  # phone numbers
-path_code_ix = re.compile(r'[(（][0-9]+[)）].*?号') # case numbers
-adr_tag = '/地址：'
-
-#%% read func
-def split_list(regex,L):
-    return list(filter(None,re.split(regex,L)))
-
-def user_to_list(u):
-    '''get name list from user string
-    Usage: '申请人:张xx, 被申请人:李xx, 原审被告:罗xx（又名罗aa）' 
-    -> ['张xx', '李xx', '罗xx（又名罗aa）']
-    '''
-    u = split_list(r'[:、,，]',u)
-    return [x for x in u if not re.search(r'申请人|被申请人|原告|被告|原审被告|上诉人|被上诉人|第三人|原审诉讼地位',x)]
-
-def check_codes(x):
-    return bool(re.search(path_code_ix.pattern,str(x)))
-
-def case_codes_fix(x):
-    '''fix string with chinese codes format
-    Usage: 'dsfdsf(2018)中文中文248号sdfsdf' -> '（2018）中文中文248号'
-    '''
-    x = str(x)
-    x = re.search(path_code_ix.pattern+r'|$',x).group().strip().replace(' ','')
-    x = x.replace('(','（').replace(')','）')
-    return x
-
-def parse_subpath(path,file):
-    '''make subpath'''
-    if not os.path.exists(path):
-        os.mkdir(path)
-    return os.path.join(path,file)
-
-def check_cn_str(x):
-    '''check if string contain chinese'''
-    return bool(re.search(r'[\u4e00-\u9fa5]',str(x)))
-
-def parse_datetime(date):
-    '''datetime transform'''
-    try:date = datetime.datetime.strptime(date,'%Y-%m-%d')
-    except ValueError:print_log('时间范围格式有误,默认选取全部日期');date = ''
-    return date
-
-def titles_trans(df_list):
-    '''change titles between Chinese and English'''
-    titles_cn2en = dict(zip(titles_cn, titles_en))
-    titles_en2cn = dict(zip(titles_en, titles_cn))
-    trans_cn_en = list(map(lambda x,y:(titles_cn2en if y else titles_en2cn).get(x),
-                           df_list,list(map(check_cn_str,df_list))))
-    return trans_cn_en
-
-def titles_trans_columns(df,titles):
-    '''sub-replace columns titles you want'''
-    titles_rest = df.drop(titles,axis=1).columns.tolist()
-    df = df[titles + titles_rest]
-    df.columns = titles_trans(titles) + titles_rest
-    return df
-
-def titles_resort(df,titles):
-    '''resort titles with orders'''
-    titles_rest = df.drop(titles,axis=1).columns.tolist()
-    return df[titles + titles_rest]
-
-def save_adjust_xlsx(df,file='test.xlsx',textfit=('当事人', '诉讼代理人', '地址'),width=60):
-    '''save and re-adjust excel format'''
-    df = df.reset_index(drop='index').fillna('')
-    if isStyleFrame:
-        StyleFrame.A_FACTOR = 5
-        StyleFrame.P_FACTOR = 1.2
-        sf = StyleFrame(df,Styler(wrap_text = False, shrink_to_fit=True, font_size= 12))
-        if('add_index' in df.columns.tolist()):
-            sf.apply_style_by_indexes(indexes_to_style=sf[sf['add_index'] == 'new'],
-                                      styler_obj=Styler(bg_color='yellow'),
-                                      overwrite_default_style=False)
-            sf.apply_column_style(cols_to_style = textfit,
-                                  width = width,
-                                  styler_obj=Styler(wrap_text=False,shrink_to_fit=True))
-        else:
-            sf.set_column_width_dict(col_width_dict={textfit: width})
-        if len(df):
-            sf.to_excel(file,best_fit=sf.data_df.columns.difference(textfit).tolist()).save()
-        else:
-            sf.to_excel(file).save()
-    else:
-        df.to_excel(file,index=0)
-    print_log('>>> 保存文件 => 文件名 \'%s\' => 数据保存成功...' %(file))
-    return df
-
-#%%
-    
-def read_jdocs_table(tables):
-    codes = ''
-    for table in tables:
-        for row in table.rows:
-            for cell in row.cells:
-                for paragraph in cell.paragraphs:
-                    x = paragraph.text
-                    if re.search(path_code_ix,x) and len(x) < 25:
-                        codes = case_codes_fix(x)
-                        break
-    return codes
-
-def get_jdocs_infos(doc,lines = 20):# search at least 20 lines docs
-    '''get pre address from judgment docs, return docs pre code and address'''
-    adrs = {};codes = ''
-    try:tables = Document(doc).tables
-    except Exception as e: 
-        print('读取错误 %s ,docx文档问题,请重新另存为,或关闭已打开的docx文档'%e)
-        return codes,adrs
-    if tables: codes = read_jdocs_table(tables)
-    paras = Document(doc).paragraphs
-    if not paras:
-        return codes,adrs
-    if len(paras) > 20: # 多于20行就扫描一般内容
-        lines = int(len(paras)/2)
-    parass = paras[:lines]
-    for i,para in enumerate(parass):
-        x = para.text.strip()
-        if len(x) > 150: continue # 段落大于150字就跳过
-        if re.search(path_code_ix,x) and len(x) < 25:
-            codes = case_codes_fix(x);continue # codes
-        cond3 = re.search(r'法定代表|诉讼|代理人|判决|律师|请求|证据|辩称|辩论|不服',x) # 跳过非人员信息
-        cond4 = re.search(r'上市|省略|区别|借款|保证|签订',x) # 跳过非人员信息,模糊 
-        cond1 = re.search(r'(?<=[：:]).*?(?=[,，。])',x)
-        cond2 = re.search(r'.*?[省市州县区乡镇村]',x)
-        if cond3:continue
-        if cond4:continue
-        if cond1 and cond2:
-            '''
-            Todo: get user and address
-            Usage: '被上诉人（原审被告）：张三，男，1977年7月7日出生，汉族，住XX自治区(省)XX市XX区1212。现住XX省XX市XX区3434'
-            -> {'张三': 'XX省XX市XX区3434'}
-            '''
-            try:
-                name = re.search(r'(?<=[：:]).*?(?=[,，。])|$',x).group(0).strip()
-                name = re.sub(r'[(（][下称|原名|反诉|变更前].*?[）)]','',name) # filter some special names,notice here will add some words for filter
-                z = split_list(r'[,，:：.。]',x)
-                z = [re.sub(r'户[籍口]|居住|身份证|所在地|住所地?|住址?|^[现原]住?','',y) for y in z if re.search(r'.*?[省市州县区乡镇村]',y)][-1] # 几个地址选最后一个 remain only address
-                adr = {name:''.join(z)}
-                adrs.update(adr)
-            except Exception as e:
-                print_log('获取信息失败 =>',e)
-    return codes,adrs
-
-def rename_jdoc_x(doc,codes):
-    '''rename only judgment doc files'''
-    jdoc_name = os.path.join(os.path.split(doc)[0],'判决书_'+codes+'.docx')
-    if not codes in doc:# os.path.exists(jdoc_name)
-        try:
-            os.rename(doc,jdoc_name)
-            return True
-        except Exception as e:
-            print_log(e)
-            os.remove(doc)
-            return False
-    return False
-
-def get_all_jdocs(docs):
-    numlist=[]; nadr = []
-    for doc in docs:
-        codes,adrs = get_jdocs_infos(doc)
-        if codes:
-            rename_jdoc_x(doc,codes)
-        numlist.append(codes)
-        nadr.append(adrs)
-        if flag_check_jdocs and codes:
-            print_log('>>> 判决书信息 【%s】-【%s人】-%s \n'%(codes,len(adrs),adrs))
-    numlist = list(map(case_codes_fix,numlist))
-    return DataFrame({'判决书源号':numlist,'new_adr':nadr})
-
-#%%
-def copy_rows_adr(x):
-    ''' copy jdocs address to address column''' 
-    '''格式:['当事人','诉讼代理人','地址','new_adr','案号']'''
-    x[:3] = x[:3].astype(str)
-    user = x[0];agent = x[1];adr = x[2];n_adr = x[3];codes = x[4]
-    if not isinstance(n_adr,dict):
-        return adr
-    else:
-        y = split_list(r'[,，]',adr)
-        adr1 = y.copy()
-        for i,k in enumerate(n_adr):
-            by_agent = any([k in ag for ag in re.findall(r'[\w+、]*\/[\w+]*',agent)]) # 找到代理人格式 'XX、XX/XX_123123'
-            if by_agent and k in adr: # remove user's address when user with agent 用户有代理人就不要地址
-                y = list(filter(lambda x:not k in x,y))
-            if type(n_adr) == dict and not k in adr and k in user and not by_agent:
-                y += [k+adr_tag+n_adr.get(k)] # append address by rules 输出地址格式
-        adr2 = y.copy()
-        adr =  '，'.join(list(filter(None, y)))
-        if Counter(adr1) != Counter(adr2) and flag_check_jdocs and adr:print_log('>>> 【%s】成功复制判决书地址=>【%s】'%(codes,adr))
-    return adr
-
-def copy_users_compare(jrow,df,errs=list('    ')):
-    '''copy users and check users completement
-    errs=['【OA无用户记录】','【用户错别字】','【字段重复】','【系列案】']
-    如下对比：
-    不相交，OA无用户记录
-    判断字段重复,输出重复的内容
-    比例确定怀疑用户错别字，判别不了直接正常输出
-    判决书多于当前案件,认为是系列案
-    判决书少于当前案件,当前案件缺部分地址
-    '''
-    
-    code0 = str(df['案号']).strip()
-    code1 = str(df['原一审案号']).strip()
-    jcode = str(jrow['判决书源号']).strip()
-    x = Counter(user_to_list(df['当事人'])) # 当前案件
-    y = Counter(list(jrow['new_adr'].keys())) # 判决书
-    rxy = len(list((x&y).elements()))/len(list((x|y).elements()))
-    rxyx = len(list((x&y).elements()))/len(list(x.elements()))
-    rxyy = len(list((x&y).elements()))/len(list(y.elements()))
-    if flag_print:
-        print('x=',x);print('y=',y);print('rxy=',rxy)
-        print('rxyx=',rxyx);print('rxyy=',rxyy)
-    if rxy == 0: # 不相交，完全无关
-        return errs[0]
-    if max(x.values()) > 1 or max(y.values()) > 1: # 有字段重复
-        xdu = [k for k,v in x.items() if v > 1] # 重复的内容
-        ydu = [k for k,v in y.items() if v > 1]
-        print_log('>>> 用户有字段重复【%s】-【案件:%s】 vs 【判决书:%s】'
-                  %("{0:.0%}".format(rxy),xdu,ydu))
-        return errs[2]
-    if rxy == 1: # 完全匹配
-        return df['当事人']
-    if 0 < rxy < 1: # 错别字
-        dx = list((x-y).elements())
-        dy = list((y-x).elements())
-        xx = Counter(''.join(dx))
-        yy = Counter(''.join(dy))
-        rxxyy = len(list(xx&yy.keys()))/len(list(xx|yy.keys()))
-        if flag_print:print('rxxyy=',rxxyy)
-        if rxxyy >= .6:
-            print_log('>>> 觉得有【错别字率 %s】->【案件:%s vs 判决书:%s】'
-                      %("{0:.0%}".format(1-rxxyy),dx,dy))
-            return errs[1]
-        elif rxxyy >= .2:
-            print_log('>>> 觉得不好判断当正常处理【差异率 %s】vs【相同范围:%s】->【差异范围:案件:%s vs 判决书:%s】 '
-                          %("{0:.0%}".format(1-rxxyy),
-                            list((x&y).elements()),
-                            dx,dy))
-            return df['当事人']
-    if rxyx > .8:
-        print_log('>>> 案件 %s人 < 判决书  %s人'%(len(x),len(y)))
-        if jcode != code1:# 系列案
-            print_log('>>> 觉得是【系列案,判决书人员 %s 多出地址】'%(list((y-x).elements())))
-            return errs[3]
-        else:
-            return df['当事人']
-    elif rxyy > .8:
-        print_log('>>> 案件 %s人 > 判决书 %s人'%(len(x),len(y)))
-        print_log('>>> 觉得有【当前案件人员 %s 缺地址】'%(list((x-y).elements())))
-        return df['当事人']
-    return errs[0]
-    
-#%%
-
-def save_jdocs_infos(x):
-    '''save remane jdocs'''
-    try:
-        x = x.fillna('')
-        save_adjust_xlsx(x,file=address_tmp_xlsx,textfit=('判决书源号','new_adr'))
-#        x.to_excel(address_tmp_xlsx,index=False)
-    except Exception as e:
-        print_log('%s <= 保存失败,请检查... %s'%(address_tmp_xlsx,e))
-  
-
-def new_adr_format(n_adr):
-    y=[]
-    for i,k in enumerate(n_adr):
-        y += [k+adr_tag+n_adr.get(k)]
-    return ('，'.join(list(filter(None, y))))
-      
-def copy_rows_user_func(dfj,dfo):
-    
-    def copy_rows_adr1(x,n_adr):
-        ''' copy jdocs address to address column
-            格式:['当事人','诉讼代理人','地址','new_adr','案号']
-            同时排除已有代理人的信息
-        ''' 
-        user = x['当事人'];agent = x['诉讼代理人'];adr = x['地址']; codes = x['案号']
-        if not isinstance(n_adr,dict):
-            return adr
-        else:
-            y = split_list(r'[,，]',adr)
-            adr1 = y.copy()
-            for i,k in enumerate(n_adr):
-                by_agent = any([k in ag for ag in re.findall(r'[\w+、]*\/[\w+]*',agent)]) # 找到代理人格式 'XX、XX/XX_123123'
-                if by_agent and k in adr: # remove user's address when user with agent 用户有代理人就不要地址
-                    y = list(filter(lambda x:not k in x,y))
-                if type(n_adr) == dict and not k in adr and k in user and not by_agent:
-                    y += [k+adr_tag+n_adr.get(k)] # append address by rules 输出地址格式
-            adr2 = y.copy()
-            adr =  '，'.join(list(filter(None, y)))
-            if Counter(adr1) != Counter(adr2) and flag_check_jdocs and adr:print_log('>>> 【%s】成功复制判决书地址=>【%s】'%(codes,adr))
-        return adr
-    
-    '''copy users line regard adr user'''
-    errs = ['【OA无用户记录】','【用户错别字】','【字段重复】','【系列案】']
-    
-    dfo['判决书源号'] = ''
-    
-    for (i,dfor) in dfo.iterrows():
-        for (j,dfjr) in dfj.iterrows():
-            code0 = str(dfor['案号']).strip()
-            code1 = str(dfor['原一审案号']).strip()
-            jcode = str(dfjr['判决书源号']).strip()
-            n_adr = dfjr['new_adr']
-            if isinstance(n_adr,dict):
-                if not n_adr:continue# 提取jdocs字段失败
-                if code1 == jcode:# 同案号，则找到内容
-                    print_log('\n>>> 找到信息_案号=%s__源号=%s__判决书源号=%s'%(code0,code1,jcode))
-                    dfo.loc[i,'地址'] = copy_rows_adr1(dfor,n_adr)
-                    dfo.loc[i,'判决书源号'] = jcode
-                    break
-                else:#[::-1] # 没案号
-                    tag1 = copy_users_compare(dfjr,dfor,errs)
-                    if tag1 not in errs:
-                        print_log('\n>>> 找到信息_案号=%s__源号=%s__判决书源号=%s'%(code0,code1,jcode))
-                        dfo.loc[i,'地址']= copy_rows_adr1(dfor,n_adr)
-                        dfo.loc[i,'判决书源号'] = jcode
-                        break
-                    else:
-                        pass
-    save_jdocs_infos(dfj)
-    return dfo
-    
 
 #%%
 
-def rename_jdocs_codes_x(d,r,old_codes):
-    '''add jdoc current case codes for reference 判决书改名，包括源案号'''
-    if str(r[old_codes]) in str(d):
-        nd = os.path.join(os.path.split(d)[0],'判决书_'+str(r['案号']) +'_原_'+ str(r[old_codes]) + '.docx')
-        if(d == nd):
-            return d
-        try:
-            if os.path.exists(nd):
-                os.remove(nd)
-#            if '_原_' in d:
-#                shutil.copyfile(d,nd)
-            else:
-                os.rename(d,nd)
-                print('>>> 重命名判决书 => ',nd)
-        except Exception as e:
-            print_log(e)
-        return nd
-    return d
-
-def rename_jdocs_codes(dfo):
-    '''rename with new codes'''
-    old_codes='判决书源号'
-    docs = glob(parse_subpath(jdocs_path,'判决书_*.docx'))
-    df = dfo[dfo[old_codes] != '']
-    if docs:
-        for doc in docs:
-            for (i,dfr) in df.iterrows():
-                if check_codes(dfr[old_codes]) and str(dfr[old_codes]) in doc:
-                    rename_jdocs_codes_x(doc,dfr,old_codes)
-                    break
-    return None
-
-def fill_infos_func(dfj,dfo):
-    '''填充信息并处理系列案'''
-    dd = dfo[['适用程序','当事人']][dfo['适用程序'].str.len()>2].drop_duplicates().copy()
-    dfoo = dfo.copy()
-    for tag1,tag2 in zip(dd['适用程序'].to_list(),dd['当事人'].to_list()):
-        serise = dfo[(dfo['适用程序']==tag1)&(dfo['当事人']==tag2)]
-        if len(serise) > 0:
-            ss = serise.iloc[0].copy()
-            if '_集合' not in ss['适用程序']:
-                print_log('>>> 发现系列案：',serise['案号'].to_list())
-                sn0 = serise['案号'].to_list()
-                sn = [re.search(r'\d+(?=号)|$',x).group(0) for x in sn0]
-                if sn[0] != sn[-1]:
-                    ss['案号'] = re.sub(r'\d+(?=号)','%s-%s'%(sn[0],sn[-1]),sn0[0])
-                ss['适用程序'] = ss['适用程序']+'_集合'
-                dfoo = pd.concat([ dfoo[~dfoo.isin(serise).all(1)],
-                                               ss.to_frame().T])
-    dfo = dfoo
-    dfo = copy_rows_user_func(dfj,dfo)
-    rename_jdocs_codes(dfo)
-    return dfo
-
-#%% df process steps
-
-def df_read_fix(df):
-    '''fix codes remove error format 处理案号格式'''
-    df[['立案日期','案号','主审法官','当事人']] = df[['立案日期','案号','主审法官','当事人']].replace('',float('nan'))
-    df.dropna(how='any',subset=['立案日期','案号','主审法官','当事人'],inplace=True)
-    df['原一审案号'] = df['原一审案号'].fillna('')
-    df[['案号','原一审案号']] = df[['案号','原一审案号']].applymap(case_codes_fix)
-    return df
-
-def df_fill_infos(dfo):
-    '''main fill jdocs infos'''
-    if len(dfo) == 0:return dfo
-    docs = glob(parse_subpath(jdocs_path,'*.docx')) # get jdocs
-    if not docs:return dfo
-    dfj = get_all_jdocs(docs)
-    global dd
-    dd = dfj
-    if len(dfj) == 0:
-        print_log('>>> 没有找到判决书...不处理！！');return dfo
-    dfn = fill_infos_func(dfj,dfo)
-    dfn = titles_resort(dfn,titles_main)
-    try:
-        if flag_fill_jdocs_infos:
-            dfo = save_adjust_xlsx(dfn,data_xlsx)
-    except PermissionError:
-        print_log('>>> %s 文件已打开...填充判决书地址失败！！...请关闭并重新执行'%data_xlsx)
-    return dfo
-
-def df_make_subset(df,oa_new=0):
-    '''
-    cut orgin data into subset by conditions
-    '''
-    dcn = case_codes_fix(data_case_codes)
-    date_range = data_date_range
-    last_lines = data_last_lines
-    if dcn:  # 多个指定案号例如: （2018）哈哈1234号,（2018）哈哈3333号
-        df = df[df['案号'].isin(split_list('[,，;；]',dcn)) | df['原一审案号'].isin(split_list('[,，;；]',dcn))]
-    elif ':' in date_range:
-        print_log('\n>>> 预定读取【%s】'%date_range)
-        df['立案日期'] = to_datetime(df['立案日期'])
-        df.sort_values(by=['立案日期'],inplace=True)
-        try:
-            dats = date_range.split(':')
-            x = parse_datetime(dats[0]);y = parse_datetime(dats[1])
-            x1 = df['立案日期'].iloc[0].to_pydatetime()
-            y1 = df['立案日期'].iloc[-1].to_pydatetime()
-            t1 = min(x,y); t2 = max(x,y)
-            t1 = max(t1,x1);t2 = min(t2,y1)
-            date_start = t1 if t1 else x1
-            date_end = t2 if t2 else y1
-            df = df[(df['立案日期']>=date_start)&(df['立案日期']<=date_end)].copy() #这里数据分片有警告
-            df['立案日期'] = df['立案日期'].astype(str)
-            return df
-        except Exception as e:
-            print_log('>>> 日期异常',e)
-    elif last_lines:
-        df = df.tail(last_lines)
-    return df
-
-#%%
-def df_oa_append(dfo):
-    '''main fill OA data into df data and mark new add'''
-    if flag_append_oa:
-        if not os.path.exists(data_oa_xlsx):
-            print_log('>>> 没有找到OA模板 %s...不处理！！'%data_oa_xlsx);return dfo
-        dfoa = read_excel(data_oa_xlsx,sort=False)[titles_oa].fillna('') # only oa columns
-        df1 = dfo.copy()
-        df2 = dfoa.copy()
-        
-        if '适用程序' not in dfo.columns:
-            dfo['适用程序'] = 0
-        dfoa = df_make_subset(dfoa,oa_new=1) # subset by columns
-        dfoa.rename(columns={'承办人':'主审法官'},inplace=True)
-        dfoa = df_read_fix(dfoa) # fix empty data columns
-        dfoa['add_index'] = 'new'
-        dfo['add_index'] = 'old'
-        dfors = dfo['适用程序']
-        
-        tags = list(dfors[dfors.str.len()>2&dfors.apply(lambda x:'Done' in x)].unique())
-        tags = [t.replace('_集合','') for t in tags]
-        
-        for i,df2r in dfoa.iterrows():
-            if df2r['适用程序'] in tags:continue
-            dfo = dfo.append(df2r,sort=False)
-                
-        dfo.fillna('',inplace=True)
-        dfo.drop_duplicates(['立案日期','案号'],keep='first',inplace=True)
-        
-        dfo.sort_values(by=['立案日期','案号'],inplace=True)
-        df_noa = dfo[dfo['add_index'] == 'new']
-        print_log('>>> 所有OA记录【%s条】...'%len(dfoa))
-        print_log('>>> 原Data记录【%s条】...'%len(dfo))
-        print_log('>>> 实际添加【%s条】新OA记录...'%len(df_noa))
-        if len(df_noa):
-            dd = str(df_noa['立案日期'].iloc[0]) +':'+df_noa['立案日期'].iloc[-1]
-            print_log('>>> 实际添加【%s】'%dd)
-        if any(dfo['add_index'] == 'new'):
-            dfo = titles_resort(dfo,titles_main)
-            try:dfo = save_adjust_xlsx(dfo,data_xlsx)
-            except PermissionError:print_log('>>> %s 文件已打开...填充OA数据失败！！。。。请关闭并重新执行'%data_xlsx)
-    return dfo
-
-def df_check_format(x):
-    '''check data address and agent format with check flag'''
-    if x['aname']!='' and not re.search(r'[\/_]',x['aname']):
-        print_log('>>> 记录\'%s\'---- 【诉讼代理人】格式 \'%s\' 不正确,如无请留空,请自行修改...'%(x['number'],x['aname']))
-    if x['address']!='' and not re.search(r'\/地址[:：]',x['address']):
-        print_log('>>> 记录\'%s\'---- 【地址】格式 \'%s\' 不正确,如无请留空,请自行修改...'%(x['number'],x['address']))
-    return x
-
-
-
-#%% df tramsfrom functions
-def clean_rows_aname(x,names):
-    '''Clean agent name for agent to match address's agent name'''
-    if names:
-        for name in names:
-            if not check_cn_str(name):continue
-            if name in x:
-#                if flag_print: print('A=%s,B=%s'%(x,name))
-                x = name;break
-    x = re.sub(r'_.*','',x)
-    x = re.sub(path_names_clean,'',x)
-    return x
-
-def clean_rows_adr(adr):
-    '''clean adr format'''
-    y = split_list(r'[,，]',adr)
-    if y:
-        y = list(map(lambda x: x if re.search(r'\/地址[:：]',x) else adr_tag + x,y))
-        adr = '，'.join(list(filter(None, y)))
-    return adr
-
-def make_adr(adr,fix_aname=[]):
-    '''
-    clean_aname:合并标识,此处如果没律师，则代理人就是自己
-    fix_aname:修正名字错误
-    Returns:
-          level_0       address        clean_aname
-    0       44      XX市XX镇XXX村          张三
-    1       44      XXX市XX区XXX          B律师
-    '''
-    adr = adr[adr != '']
-    adr = adr.str.strip().str.split(r'[,，。]',expand=True).stack()
-    adr = adr.str.strip().apply(lambda x:clean_rows_adr(x))
-    adr = adr.str.strip().str.split(r'\/地址[:：]',expand=True).fillna('')
-    adr.columns = ['aname','address']
-    adr['clean_aname'] = adr['aname'].str.strip().apply(lambda x:clean_rows_aname(x,fix_aname)) # clean adr
-    adr = adr.reset_index().drop(['level_1','aname'],axis=1)
-    return adr
-
-def make_agent(agent,fix_aname=[]):
-    ''' 
-    fix_aname:修正名字错误,假如律师(aname)有多个,则选择第一个律师作为合并标识(clean_aname)，注意没有律师的合并就是自己(uname)做代理人
-    Returns:
-       level_0       uname            aname              clean_aname
-    0       44         张三          A律师_123213123                A律师
-    1       44         李四                                       
-    2       44         王五       B律师_123123132123、C律师_123123   B律师
-    '''
-    agent = agent[agent != '']
-    agent = agent.str.strip().str.split(r'[,，。]',expand=True).stack() #Series
-    agent = agent.str.strip().str.split(r'\/',expand=True).fillna('') #DataFrame
-    agent.columns = ['uname','aname']
-    agent['clean_aname'] = agent['aname'].str.strip().apply(lambda x: clean_rows_aname(x,fix_aname))
-    dd_l = agent['uname'].str.strip().str.split(r'、',expand=True).stack().to_frame(name = 'uname').reset_index()
-    dd_r = agent[agent.columns.difference(['uname'])].reset_index()
-    agent = merge(dd_l,dd_r,how='outer',on=['level_0','level_1']).drop(['level_1','level_2'],axis=1).fillna('')
-    return agent
-
-def merge_user(user,agent):
-    '''合并后以uname为主,clean_aname是律师标识
-    Returns:
-       level_0       uname            aname              clean_aname
-    0       44         张三          A律师_123213123                A律师
-    2       44         王五       B律师_123123132123、C律师_123123   B律师
-    '''
-    return merge(user,agent,how='left',on=['level_0','uname']).fillna('')
-
-def merge_usr_agent_adr(agent,adr):
-    ''' clean_aname 去除nan,保留曾用名
-    '''
-    agent['clean_aname'].replace('',float('nan'),inplace=True)
-    agent['clean_aname'] = agent['clean_aname'].fillna(agent['uname']).replace(path_names_clean,'')
-    adr['clean_aname'] = adr['clean_aname'].apply(lambda x: clean_rows_aname(x,agent['clean_aname'].tolist()))
-    tb = merge(agent,adr,how='outer',on=['level_0','clean_aname']).fillna('')
-    tb.dropna(how='all',subset=['uname', 'aname'],inplace=True)
-    return tb
-
-def reclean_data(tb):
-    tg = tb.groupby(['level_0','clean_aname','aname','address'])['uname'].apply(lambda x: '、'.join(x.astype(str))).reset_index()
-    glist = tg['uname'].str.split(r'、',expand=True).stack().values.tolist()
-    rest = tb[tb['uname'].isin(glist) == False]
-    x = concat([rest,tg],axis=0,sort=True)
-    return x
-
-def sort_data(x,number):
-    x = x[['level_0','uname','aname','address']].sort_values(by=['level_0'])
-    x = merge(number,x,how='right',on=['level_0']).drop(['level_0'],axis=1).fillna('')
-    return x
-
-#%% main processing stream 主数据流程
-
-try:
-    if not os.path.exists(cfgfile):
-        '''生成默认配置'''
-        write_config()
-    conf_list = read_config()
-except Exception as e:
-    print_log('>>> 配置文件出错 %s ,删除...'%e)
-    if os.path.exists(cfgfile):
-        os.remove(cfgfile)
-    try:
-        write_config()
-        conf_list = read_config()
-    except Exception as e:
-        '''这里可以添加配置问题预判问题'''
-        print_log('>>> 配置文件再次生成失败 %s ...'%e)
-        set_default_value(data_date_range = '')
-        
 print_log('''>>> 正在处理...
     主表路径 = %s
     指定案件 = %s
     指定日期 = %s
     指定条数 = %s
     '''%(os.path.abspath(data_xlsx),
-        conf_list.get('data_case_codes'),
-        conf_list.get('data_date_range'),
-        conf_list.get('data_last_lines'),
+        data_case_codes,
+        data_date_range,
+        data_last_lines,
         )
     )
-    
 
 if not os.path.exists(data_xlsx):
-    save_adjust_xlsx(DataFrame(columns=titles_main),data_xlsx,width=40)
-    print_log('>>> %s 记录文件不存在...重新生成'%(data_xlsx))
-
-dfo = read_excel(data_xlsx,sort=False).fillna('') #真正读取记录位置
-dfo = df_read_fix(dfo) # fix empty data columns
-dfo = df_oa_append(dfo) # append oa data
-
-dfo = df_fill_infos(dfo) # fill jdocs infos
-dfo = df_make_subset(dfo)
-df = titles_trans_columns(dfo,titles_cn) # 中译英方便后面处理
+    ut.save_adjust_xlsx(pd.DataFrame(columns=titles_main),data_xlsx,width=40)
+    print_log('>>> %s 文件不存在...重新生成'%(data_xlsx))
 
+#%%
+df = pd.read_excel(data_xlsx,sort=False).fillna('') #真正读取记录位置
+df = df_read_fix(df);df # fix empty data columns
+df,ocodes = df_oa_append(df) # append oa data and save # 合并前记录案号
+df = merge_group_cases(df);df # merge group and save
+df = df[df['原一审案号'].isin(ocodes)];df # 合并后找回记录案号
+df = df_fill_infos(df) # 填充判决书内容 # filled and save
+df = ut.titles_trans_columns(df,titles_cn);df # 中译英方便后面处理
 if flag_check_postal:
     df.apply(lambda x:df_check_format(x), axis=1)
-    
-print_log('\n>>> ***将要打印Data记录【---%s条----】...'%len(df))
 
 if 0<len(df)<10:
-    print_log('>>> ***将要打印 => %s '%df['number'].to_list())
+    print_log('>>> 将要打印【%s条】=> %s '%(len(df),
+                                       df['number'].to_list()))
 
 #%% df tramsfrom stream 数据转换流程
 
-if len(df) and flag_to_postal:  
+if len(df) and flag_to_postal:
     try:
         print_log('\n>>> 开始生成新数据 data_main_temp... ')
         '''获取 datetime|number'''
@@ -820,22 +133,19 @@ def sort_data(x,number):
             print_log('>>> 缺失【代理人】和【地址】...正在处理...')
             agent_adr.index.name = 'level_0'
             agent_adr.reset_index(inplace=True)
-            df_x = merge(user,agent_adr,how='left',on=['level_0']).fillna('')
+            df_x = pd.merge(user,agent_adr,how='left',on=['level_0']).fillna('')
             df_x = sort_data(df_x,number)
 
         if len(df_x):
             data_tmp = os.path.splitext(data_xlsx)[0]+"_tmp.xlsx"
             df_save = df_x.copy()
-            df_save.columns = titles_trans(df_save.columns.tolist())
-            try:df_save = save_adjust_xlsx(df_save,data_tmp,width=40)
-            except PermissionError: print_log('>>> %s 文件已打开...请手动关闭并重新执行...保存失败'%data_tmp)
+            df_save.columns = ut.titles_switch(df_save.columns.tolist())
+            df_save = ut.save_adjust_xlsx(df_save,data_tmp,width=40)
 
-        
     except Exception as e:
-        raise e
-        print_log('>>> 错误 \'%s\' 生成数据失败,请检查源 \'%s\' 文件...退出...'%(e,data_xlsx));sys.exit()
+        input_exit('>>> 错误 \'%s\' 生成数据失败,请检查源 \'%s\' 文件...退出...'%(e,data_xlsx))
 
-#%% generate postal sheets 生成邮单流程
+#%% print postal sheets 打印邮单流程
 
 def re_write_text(x):
     '''re-write postal sheet content from df rows'''
@@ -848,6 +158,7 @@ def re_write_text(x):
     number_text = str(x['number'])
     address_text = str(x['address'])
 
+    # 以下填充均对于模板sheet.doc
     try:
         para = doc.paragraphs[9]  # No.9 line is agent name
         text = re.sub(r'[\w（）()]+',agent_text,para.text)
@@ -857,8 +168,9 @@ def re_write_text(x):
         text = re.sub(r'代 \w+',user_text,para.text)
         para.clear().add_run(text)
 
+
         para = doc.paragraphs[13]  # No.13 line is number and address
-        text = re.sub(path_code_ix,number_text,para.text)
+        text = re.sub(ut.path_code_ix,number_text,para.text)
         para.clear().add_run(text)
         text = re.sub(r'(?<=\s)\w+市.*',address_text,para.text)
         para.clear().add_run(text)
@@ -868,44 +180,39 @@ def re_write_text(x):
     sheet_file = number_text+'_'+agent_text+'_'+user_text+'_'+address_text+'.docx'
     sheet_file = re.sub(r'[\/\\\:\*\?\"\<\>]',' ',sheet_file) # keep rename legal
 
-    if os.path.exists(parse_subpath(postal_path,sheet_file)):
-        if flag_check_postal:print_log('>>> 邮单已存在！！！ <= %s'%sheet_file)
+    if os.path.exists(ut.parse_subpath(postal_path,sheet_file)):
+        if ut.flag_check_postal:print_log('>>> 邮单已存在！！！ <= %s'%sheet_file)
         return ''
 
     if not agent_text:
         if flag_check_postal:print_log('>>> 【代理人】暂缺！！！ <= %s'%sheet_file)
         return ''
-    
+
     if not address_text:
         if flag_check_postal:print_log('>>> 【地址】暂缺！！！ <= %s'%sheet_file)
         return ''
     try:
-        doc.save(parse_subpath(postal_path,sheet_file))
+        doc.save(ut.parse_subpath(postal_path,sheet_file))
         print_log('>>> 已生成邮单 => %s'%sheet_file)
         return sheet_file
     except Exception as e:
-        print_log('>>> 生成失败！！！ => %s'%e)
+        input_exit('>>> 生成失败！！！ => %s ...任意键退出'%e)
     return ''
 
+
 if len(df) and flag_to_postal:
-    print_log('\n>>> 正在输出邮单...')
+    print_log('\n>>> 正在输出邮单...\n')
     if not os.path.exists(sheet_docx):
-        input('>>> 没有找到邮单模板 %s...任意键退出'%sheet_docx);sys.exit()
+        input_exit('>>> 没有找到邮单模板 %s...任意键退出'%sheet_docx)
     df_p = df_x.apply(re_write_text,axis = 1)
     count = len(df_p[df_p != ''])
     codes = df_x['number'].astype(str)
     dates = df_x['datetime'].astype(str)
     codesrange = codes.iloc[0] if codes.iloc[0] == codes.iloc[-1] else ('%s:%s'%(codes.iloc[0],codes.iloc[-1]))
     datesrange = dates.iloc[0] if dates.iloc[0] == dates.iloc[-1] else ('%s:%s'%(dates.iloc[0],dates.iloc[-1]))
-    print_log('>>> 最终生成邮单【%s条】范围: 【%s】日期:【%s】'%(count,codesrange,datesrange))
-    
+    print_log('\n>>> 最终生成邮单【%s条】范围: 【%s】日期:【%s】'%(count,codesrange,datesrange))
+
     del df_x,df_p,codes,dates
     del user,number,agent,adr,df,agent_adr,opt
-    
-#%% main finish 结束所有
-
-print_log('>>> 全部完成,可以回顾记录...任意键退出')
-#input('>>> 全部完成,可以回顾记录...任意键退出');sys.exit()
-
-
 
+input_exit('>>> 全部完成,可以回顾记录...任意键退出')
diff --git a/getjdocs.py b/getjdocs.py
new file mode 100644
index 0000000..6f07bba
--- /dev/null
+++ b/getjdocs.py
@@ -0,0 +1,126 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 11 11:44:28 2019
+
+@author: autol
+"""
+
+import os,re
+from glob import glob
+import pandas as pd
+from docx import Document
+from util import check_codes,case_codes_fix,split_list,parse_subpath
+from globalvar import *
+
+#%% 读取判决书jdocs代码
+
+def read_jdocs_table(tables):
+    '''Return the first case code found in any table cell, or '' when no cell holds one.'''
+    for table in tables:
+        for row in table.rows:
+            for cell in row.cells:
+                for paragraph in cell.paragraphs:
+                    x = paragraph.text
+                    # a paragraph under 25 chars that matches the case-number pattern is the code
+                    if re.search(path_code_ix,x) and len(x) < 25:
+                        return case_codes_fix(x) # a bare `break` here only left the innermost loop
+    return ''
+
+def get_jdocs_infos(doc,lines = 20):# search at least 20 lines docs
+    '''Read one judgment .docx and return (codes, adrs): its case code and a {party name: address} dict.'''
+    adrs = {};codes = ''
+    try:jdoc = Document(doc);tables = jdoc.tables # parse the file once and reuse it below
+    except Exception as e:
+        print_log('读取错误 %s ,docx文档问题,请重新另存为,或关闭已打开的docx文档'%e)
+        return codes,adrs
+    if tables: codes = read_jdocs_table(tables)
+    paras = jdoc.paragraphs
+    if not paras:
+        return codes,adrs
+    if len(paras) > 20: # with more than 20 paragraphs, scan the first half instead
+        lines = int(len(paras)/2)
+    parass = paras[:lines]
+    for para in parass:
+        x = para.text.strip()
+        if len(x) > 150: continue # skip paragraphs longer than 150 chars
+        if re.search(path_code_ix,x) and len(x) < 25:
+            codes = case_codes_fix(x);continue # a short line matching the case-number pattern
+        cond3 = re.search(r'法定代表|诉讼|代理人|判决|律师|请求|证据|辩称|辩论|不服',x) # keywords of lines that are not party info
+        cond4 = re.search(r'上市|省略|区别|借款|保证|签订',x) # fuzzier keywords of non-party text
+        cond1 = re.search(r'(?<=[：:]).*?(?=[,，。])',x) # text between a colon and the next comma/period
+        cond2 = re.search(r'.*?[省市州县区乡镇村]',x) # contains an address-like character
+        if cond3:continue
+        if cond4:continue
+        if cond1 and cond2:
+            # Extract the party name and the address from one line, e.g.
+            # '被上诉人（原审被告）：张三，男，1977年7月7日出生，汉族，住XX自治区(省)XX市XX区1212。现住XX省XX市XX区3434'
+            # -> {'张三': 'XX省XX市XX区3434'}
+            # The name is the text after the first colon; the address is the last
+            # piece returned by split_list that matches the address pattern.
+            try:
+                name = re.search(r'(?<=[：:]).*?(?=[,，。])|$',x).group(0).strip()
+                name = re.sub(r'[(（][下称|原名|反诉|变更前].*?[）)]','',name) # NOTE(review): [...] is a character class, not an alternation of these words - confirm intent
+                z = split_list(r'[,，:：.。]',x)
+                z = [re.sub(r'户[籍口]|居住|身份证|所在地|住所地?|住址?|^[现原]住?','',y) for y in z if re.search(r'.*?[省市州县区乡镇村]',y)][-1] # of several addresses keep the last one
+                adr = {name:''.join(z)}
+                adrs.update(adr)
+            except Exception as e:
+                print_log('获取信息失败 =>',e)
+    return codes,adrs
+
+def rename_jdoc_x(doc,codes):
+    '''Rename a judgment doc to "判决书_<codes>.docx" in the same folder; return 1 if renamed, else 0.'''
+    jdoc_name = os.path.join(os.path.split(doc)[0],'判决书_'+codes+'.docx')
+    if codes not in doc:
+        try:
+            os.rename(doc,jdoc_name)
+            return 1
+        except Exception as e:
+            print_log(e)
+            if os.path.exists(jdoc_name): os.remove(doc) # delete the source only when it duplicates an existing renamed doc
+            return 0
+    return 0
+
+def get_all_jdocs(docs):
+    '''Main entry: read every doc, rename those that yielded a code, return a DataFrame of codes and address dicts.'''
+    numlist=[]; nadr = []
+    for doc in docs:
+        codes,adrs = get_jdocs_infos(doc)
+        if codes:
+            rename_jdoc_x(doc,codes)
+        numlist.append(codes)
+        nadr.append(adrs)
+        if flag_check_jdocs and codes:
+            print_log('>>> 判决书信息 【%s】-【%s人】-%s \n'%(codes,len(adrs),adrs))
+    numlist = list(map(case_codes_fix,numlist))
+    return pd.DataFrame({'判决书源号':numlist,'new_adr':nadr})
+
+
+def rename_jdocs_codes_x(d,r,old_codes):
+    '''Rename doc d to "判决书_<案号>_原_<r[old_codes]>.docx"; return the target path, or d if r[old_codes] is not in d.'''
+    if str(r[old_codes]) in str(d):
+        nd = os.path.join(os.path.split(d)[0],'判决书_'+str(r['案号']) +'_原_'+ str(r[old_codes]) + '.docx')
+        if(d == nd): # already carries the target name
+            return d
+        try:
+            # os.replace overwrites an existing target in one step, so a failed
+            # rename can no longer leave the old target deleted with nothing in its place
+            os.replace(d,nd)
+            print_log('>>> 重命名判决书 => ',nd)
+        except Exception as e:
+            print_log(e)
+        return nd
+    return d
+
+def rename_jdocs_codes(dfo):
+    '''For each "判决书_*.docx" under jdocs_path, rename it using the first row whose 判决书源号 appears in its path.'''
+    old_codes='判决书源号'
+    docs = glob(parse_subpath(jdocs_path,'判决书_*.docx'))
+    df = dfo[dfo[old_codes] != '']
+    if docs:
+        for doc in docs:
+            for (i,dfr) in df.iterrows():
+                if check_codes(dfr[old_codes]) and str(dfr[old_codes]) in doc:
+                    rename_jdocs_codes_x(doc,dfr,old_codes)
+                    break
+    return None
\ No newline at end of file
diff --git a/globalvar.py b/globalvar.py
new file mode 100644
index 0000000..0192a7c
--- /dev/null
+++ b/globalvar.py
@@ -0,0 +1,70 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 11 15:23:05 2019
+
+@author: autol
+"""
+#%%
+import os,re,sys
+import pandas as pd
+from configure import write_config,read_config
+
+#%%
+pd.set_option('display.max_colwidth',500)  # full option names: short patterns such as 'max_rows'
+pd.set_option('display.max_rows', 50)      # can match more than one option in newer pandas
+pd.set_option('display.max_columns',50)    # releases, which makes set_option raise OptionError
+
+#%% global variable
+
+titles_main = ['立案日期','适用程序','案号','原一审案号','判决书源号','主审法官','当事人','诉讼代理人','地址',]
+titles_oa = ['立案日期','案号','原一审案号','承办人','当事人','适用程序']
+titles_cn = ['立案日期','案号','当事人','诉讼代理人','地址']  # paired by position with titles_en
+titles_en = ['datetime','number','uname','aname','address']  # paired by position with titles_cn
+path_names_clean = re.compile(r'[^A-Za-z\u4e00-\u9fa5（）()：]') # matches every char that is not an ASCII letter, CJK char, parenthesis or full-width colon (keeps names, incl. former names)
+search_names_phone = lambda x: re.search(r'[\w（）()：:]+\_\d+',x)  # finds '<name>_<digits>', i.e. a name followed by a phone number
+path_code_ix = re.compile(r'[(（][0-9]+[)）].*?号') # case numbers: parenthesized digits followed by text up to the first '号'
+postal_path = os.path.join('.','postal')
+jdocs_path = os.path.join('.','jdocs')
+adr_tag = '/地址：'
+done_tag = '_集合'
+usrtag = r'申请人|被申请人|原告|被告|原审被告|上诉人|被上诉人|第三人|原审诉讼地位|申请再审人|被申请再审人' # role labels that head the party names
+
+
+#%% print_log log
+
+logname = 'log.txt'  # log file that print_log appends to
+
+def print_log(*args, **kwargs):  # print() replacement: echoes to the console, then appends the same text to logname
+    print(*args, **kwargs)
+    with open(logname, "a",encoding='utf-8') as file:
+        print(*args, **kwargs, file=file)  # NOTE(review): a caller-supplied file= keyword would clash with this one
+
+def input_exit(*args, **kwargs):
+    '''Show the input() prompt, wait for the user to press Enter, then quit through sys.exit().'''
+    input(*args, **kwargs);sys.exit()
+
+if os.path.exists(logname):  # delete a leftover log file when this module is loaded
+    os.remove(logname)
+
+#%% read configure global variable
+
+def init_var():
+    '''Load 'conf.txt' as a pandas Series: create it if missing, regenerate it once if loading fails, else exit.'''
+    cfgfile = 'conf.txt'
+    try:
+        if not os.path.exists(cfgfile):
+            write_config(cfgfile)  # generate the default configuration
+        settings = pd.Series(read_config(cfgfile))
+    except Exception as e:
+        print_log('>>> 配置文件出错 %s ,删除...'%e)
+        if os.path.exists(cfgfile): os.remove(cfgfile)
+        try:
+            write_config()  # NOTE(review): no path passed here - presumably defaults to cfgfile; confirm
+            settings = pd.Series(read_config())
+        except Exception as e:
+            # hook: known configuration problems could be diagnosed here
+            input_exit('>>> 配置文件再次生成失败 %s ...'%e)
+    return settings
+
+var = init_var()
+locals().update(var.to_dict()) # publish every loaded setting as a module-level name (at module level locals() is the module namespace)
diff --git a/rqlist.md b/rqlist.md
new file mode 100644
index 0000000..106295a
--- /dev/null
+++ b/rqlist.md
@@ -0,0 +1,5 @@
+numpy==1.16.2
+pandas==0.24.2
+python-docx==0.8.10
+StyleFrame==2.0.3
+PyInstaller==3.4
diff --git a/util.py b/util.py
new file mode 100644
index 0000000..84be0e1
--- /dev/null
+++ b/util.py
@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Sep 11 11:29:47 2019
+
+@author: autol
+"""
+
+import os,re,datetime
+from StyleFrame import StyleFrame, Styler
+from globalvar import *
+
+#%% base utils
+
+def split_list(regex,L):  # split string L on regex and drop the empty pieces
+    return [piece for piece in re.split(regex,L) if piece]
+
+def user_to_list(u):
+    '''Extract the party names from a 'role:name' listing, discarding the role labels.
+
+    Usage: '申请人:张xx, 被申请人:李xx, 原审被告:罗xx（又名罗aa）'
+    -> ['张xx', '李xx', '罗xx（又名罗aa）']
+    '''
+    return list(filter(lambda part: re.search(usrtag,part) is None, split_list(r'[:、,，]',u)))
+
+def check_codes(x):
+    '''Return True if str(x) contains a case code (a match of path_code_ix).'''
+    return path_code_ix.search(str(x)) is not None
+
+def case_codes_fix(x):
+    '''Extract the first case code from `x` and normalize it: no spaces, full-width parentheses.
+    Returns an empty string when `x` contains no case code.
+    Usage: 'dsfdsf(2018)中文中文248号sdfsdf' -> '（2018）中文中文248号'
+    '''
+    found = path_code_ix.search(str(x))
+    code = found.group().strip() if found else ''
+    return code.translate(str.maketrans({' ': None, '(': '（', ')': '）'}))
+
+def expand_codes(xxx):
+    '''Expand serial case codes into one code per case number.
+
+    Codes without a '、'-joined number series pass through unchanged.
+    Usage: ['(2018)中文中文111、248号','(2018)中文中文333、444号']
+    -> ['(2018)中文中文111号', '(2018)中文中文248号', '(2018)中文中文333号', '(2018)中文中文444号']
+    '''
+    series = r'\d+、.*\d+(?=号)'  # the 'n1、n2...' run standing directly before '号'
+    # search with '|$' so that a code without a series yields '' and is re-emitted as is
+    hits = [(xx, re.search(series+r'|$',xx).group(0)) for xx in xxx]
+    return [re.sub(series,num,xx) for xx, hit in hits for num in hit.split('、')]
+
+def parse_subpath(path,file):
+    '''Return `file` joined onto `path`, creating `path` (and any missing parents) first.'''
+    if not os.path.exists(path):
+        os.makedirs(path,exist_ok=True)  # unlike os.mkdir: nested paths work, no error if created meanwhile
+    return os.path.join(path,file)
+
+def check_cn_str(x):
+    '''Return True if str(x) holds at least one Chinese character (U+4E00..U+9FA5).'''
+    return re.search(r'[\u4e00-\u9fa5]',str(x)) is not None
+
+def parse_datetime(date):
+    '''Parse a 'YYYY-MM-DD' string into a datetime; on a malformed date log a notice and return an empty string.'''
+    try: return datetime.datetime.strptime(date,'%Y-%m-%d')
+    except ValueError: print_log('时间范围格式有误,默认选取全部日期')
+    return ''
+
+def titles_switch(df_list):
+    '''Translate column titles: Chinese ones to English, all others to Chinese (None if unknown).'''
+    cn2en = dict(zip(titles_cn, titles_en))
+    en2cn = dict(zip(titles_en, titles_cn))
+    has_cn = [check_cn_str(title) for title in df_list]
+    # a title containing Chinese characters is looked up cn->en, anything else en->cn
+    return [(cn2en if cn else en2cn).get(title) for title, cn in zip(df_list, has_cn)]
+
+def titles_trans_columns(df,titles):
+    '''Move the `titles` columns to the front and rename them via titles_switch(); the rest stay as they are.'''
+    rest = df.drop(columns=titles).columns.tolist()
+    out = df[titles + rest]
+    out.columns = titles_switch(titles) + rest
+    return out
+
+def titles_resort(df,titles):
+    '''Return `df` with the `titles` columns moved to the front; the other columns keep their order.'''
+    trailing = df.drop(columns=titles).columns.tolist()
+    return df[titles + trailing]
+
+#%% read func
+
+isStyleFrame = 1  # truthy: save_adjust_xlsx writes through StyleFrame; falsy: plain DataFrame.to_excel
+
+def save_adjust_xlsx(df,file,textfit=('当事人', '诉讼代理人', '地址'),width=60):
+    '''Save `df` to the Excel file `file` (styled through StyleFrame when isStyleFrame is truthy)
+    and return the frame with its index reset and NaN replaced by empty strings.
+    '''
+    try:
+        print_log('>>> 保存文件 => 文件名 \'%s\''%file)
+        df = df.reset_index(drop='index').fillna('')  # drop='index' only acts as a truthy flag: the old index is discarded
+        if isStyleFrame:
+            StyleFrame.A_FACTOR = 5  # presumably tunes StyleFrame's best-fit column width - confirm in the StyleFrame docs
+            StyleFrame.P_FACTOR = 1.2
+            sf = StyleFrame(df,Styler(wrap_text = False, shrink_to_fit=True, font_size= 12))
+            if('add_index' in df.columns.tolist()):
+                sf.apply_style_by_indexes(indexes_to_style=sf[sf['add_index'] == 'new'],  # rows marked 'new' get a yellow background
+                                          styler_obj=Styler(bg_color='yellow'),
+                                          overwrite_default_style=False)
+                sf.apply_column_style(cols_to_style = textfit,  # the textfit columns get the fixed width
+                                      width = width,
+                                      styler_obj=Styler(wrap_text=False,shrink_to_fit=True))
+            else:
+                sf.set_column_width_dict(col_width_dict={textfit: width})  # the whole textfit tuple is used as one dict key
+            if len(df):
+                sf.to_excel(file,best_fit=sf.data_df.columns.difference(textfit).tolist()).save()  # best-fit every column outside textfit
+            else:
+                sf.to_excel(file).save()  # empty frame: no best_fit columns are requested
+        else:
+            df.to_excel(file,index=0)  # plain pandas export without the index column
+    except PermissionError:  # only logged, not re-raised; df is still returned
+        print_log('！！！！！%s被占用，不能覆盖记录！！！！！'%file)
+    return df
+
+def check_time(dlist):
+    '''Validate the configured date(s); return (1, list of date strings) or (0, None) if empty/malformed.
+    `dlist` is a list of 'YYYY-MM-DD' strings, or one string of dates separated by ':' or '：'.'''
+    if not dlist:
+        return 0,None
+    if isinstance(dlist,str):
+        # splitting a separator-free string simply yields the one-element list [dlist]
+        dlist = split_list(r'[:：]',dlist)
+    try:
+        for date in dlist:
+            datetime.datetime.strptime(date, '%Y-%m-%d')
+    except ValueError as e:
+        print("Incorrect data format, should be YYYY-MM-DD",e)
+        return 0,None
+    return 1,dlist
+    # not reached: every path above has already returned