[toc]
python 表格操作
将awk生成的数据导入表格
pandas :pannel data analysis(面板数据分析)。pandas是基于numpy构建的,为时间序列分析提供了很好的支持。pandas中有两个主要的数据结构,一个是Series,另一个是DataFrame。
Series 类似于一维数组与字典(map)数据结构的结合。它由一组数据和一组与数据相对应的数据标签(索引index)组成。这组数据和索引标签的基础都是一个一维ndarray数组。可将index索引理解为行索引。 Series的表现形式为:索引在左,数据在右。
DataFrame是一个类似表格的数据结构,索引包括列索引和行索引,包含有一组有序的列,每列可以是不同的值类型(数值、字符串、布尔值等)。DataFrame的每一行和每一列都是一个Series,这个Series的name属性为当前的行索引名/列索引名。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import openpyxl as xl
import pandas as pd
import numpy as np
import os
import datetime
cycle=38
time_year_month_day = str(datetime.date.today()-datetime.timedelta(days=cycle))
def proj_xls_to_xlsx(file_path,sheet_name,save_name,tableTitle=None):
with open(file_path) as f:
data = f.readlines()
if not os.path.exists(save_name):
wb = xl.Workbook()
wb.save(save_name)
else:
wb = xl.load_workbook(save_name)
ws1 = wb.create_sheet(0)
ws1.title = sheet_name
if tableTitle != None:
for n in range(len(tableTitle)):
c = n + 1
ws1.cell(row=1, column=c).value = tableTitle[n]
for d in data:
value = d.split('\t')
try:
# print(value)
value[5]=float(value[5])
except:
pass
ws1.append(value)
wb.save(save_name)
def crate_shuju_toushi(save_name,sheet_name,sheet_name2,index_name,values_name,department_name,new_department_name,personnel_name):
f = pd.read_excel(io=save_name,sheet_name=sheet_name)
res = pd.pivot_table(f,index=[index_name],values=[values_name],aggfunc=np.sum, margins=True)
wb = xl.load_workbook(save_name)
old_title = wb.worksheets[0]
old_title.title = sheet_name2
all_xinxi_title=list(zip(list((res.index)),list(res[values_name])))
department_personnel = list(zip(f[department_name],f[index_name]))
department_personnel_size_list = []
for user in all_xinxi_title:
for department_personnel_list in department_personnel:
if user[0] in department_personnel_list:
if user[0] == '-':
continue
if user[0] == ' ':
continue
department_personnel_size_list.append((department_personnel_list[0],user[0],user[1]))
end_department_personnel_size_list = sorted(list(set(department_personnel_size_list)), key=lambda x:x[2] ,reverse=True)
all_xinxi_title_end = all_xinxi_title[-1]
all_xinxi_title_end = list(all_xinxi_title_end)
all_xinxi_title_end.insert(0,'')
end_department_personnel_size_list.append(all_xinxi_title_end)
end_department_personnel_size_list.insert(0,[new_department_name,personnel_name,values_name])
end_department_personnel_size_list.pop()
user_volue = []
for user2 in end_department_personnel_size_list:
user2 = list(user2)
if user2[1] == user_volue:
continue
user_volue = user2[1]
old_title.append(user2)
wb.save(save_name)
def create_space_toushi(save_name,sheet_name,sheet_name2,department_num,sum_name,data_total,index_name,values_name):
f = pd.read_excel(io=save_name,sheet_name=sheet_name,dtype={'业务XX利润编号':str})
res = pd.pivot_table(f,index=index_name,values=values_name,aggfunc=np.sum,margins=True)
all_list = list(zip(list(res.index),list(res[values_name])))
wb = xl.load_workbook(save_name)
sheet = wb.create_sheet(sheet_name2)
space_list = []
for user in all_list:
user2 = list(user)[1]
user3 = list(list(user)[0])
if user3[0] == '-':
space_list.append((user3[1],user2))
if user3[0] == ' ':
space_list.append((user3[1],user2))
sum_list = []
for slist in space_list:
sum_list.append(slist[1])
sum_list=sum(sum_list)
sort_space_list = sorted(list(set(space_list)), key=lambda x:x[1] ,reverse=True)
sort_space_list.append((sum_name, sum_list))
sort_space_list.insert(0,(department_num,data_total))
for all_space_list in sort_space_list:
sheet.append(all_space_list)
wb.save(save_name)
if __name__=='__main__':
sheet_name = 'XXX详情'
save_name = 'TJ-XJ-DATASHOW-' + time_year_month_day + '.xlsx'
save_name = 'TJ-XJ-DATASHOW-' + time_year_month_day + '.xlsx'
file_path = './able.' + time_year_month_day +'.lib.xls'
proj_xls_to_xlsx(file_path=file_path,
sheet_name=sheet_name,
save_name=save_name)
sheet_name = 'XXX总体情况'
save_name = 'TJ-XJ-DATASHOW-' + time_year_month_day + '.xlsx'
file_path = './' + time_year_month_day + '.ProjInfo.XJpath.xls'
table_title = ['业务XX名称','业务XX利润编号',
'运营经理','信息分析','涉及项目数',
'涉及数据量大小(G)','项目编号1',
'项目名称1','数据量(G)1',
'文库编号1','项目编号2',
'项目名称2','数据量(G)2',
'文库编号2','项目编号3',
'项目名称3','数据量(G)3',
'文库编号3','项目编号4',
'项目名称4 数据量(G)4','文库编号4',]
proj_xls_to_xlsx(file_path=file_path,
sheet_name=sheet_name,
tableTitle=table_title,
save_name=save_name)
save_name = 'TJ-XJ-DATASHOW-' + time_year_month_day + '.xlsx'
sheet_name = 'XXX总体情况'
sheet_name2 = 'XXX概况'
index_name = '信息分析'
values_name = '涉及数据量大小(G)'
department_name = '业务XX名称'
new_department_name = 'XX'
personnel_name = '人员'
crate_shuju_toushi(save_name=save_name,
sheet_name=sheet_name,
sheet_name2=sheet_name2,
index_name=index_name,
values_name=values_name,
department_name=department_name,
new_department_name=new_department_name,
personnel_name=personnel_name)
index_name = ['信息分析','业务XX利润编号']
save_name = 'TJ-XJ-DATASHOW-' + time_year_month_day + '.xlsx'
sheet_name = 'XXX总体情况'
department_num = 'XX编号'
data_total = '数据量(G)'
values_name = '涉及数据量大小(G)'
sheet_name2 = '未匹配人员数据量'
sum_name = 'ALL'
create_space_toushi(index_name=index_name,
save_name=save_name,
sheet_name=sheet_name,
department_num=department_num,
data_total=data_total,
values_name=values_name,
sheet_name2=sheet_name2,
sum_name=sum_name,)
完成数据透视,
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import openpyxl as xl
import datetime
difference_cycle=33
time_year_month_day = str(datetime.date.today()-datetime.timedelta(days=difference_cycle))
def add_last_excel(old_sheet,new_sheet,gaikuang):
pd01 = pd.read_excel(old_sheet,sheet_name=gaikuang,encoding='utf-8')
pd02 = pd.read_excel(new_sheet,sheet_name=gaikuang,encoding='utf-8')
pd11 = pd.read_excel(old_sheet,sheet_name='未匹配人员数据量',encoding='utf-8')
pd12 = pd.read_excel(new_sheet,sheet_name='未匹配人员数据量',encoding='utf-8')
result = pd.merge(pd01,pd02[['人员','涉及数据量大小(G)']],on='人员')
result_list = list(zip(list((result['XXX'])),list((result['人员'])),list((result['涉及数据量大小(G)_x'])),list((result['涉及数据量大小(G)_y']))))
not_match = pd.merge(pd11,pd12[['XXX编号','数据量(G)']],left_on='XXX编号',right_on='XXX编号')
print(not_match)
not_match_list = list(zip(list((not_match['XXX编号'])),list((not_match['数据量(G)_x'])),list((not_match['数据量(G)_y']))))
wb2 = xl.load_workbook(new_sheet)
remove_sheet1 = wb2[gaikuang]
remove_sheet2 = wb2['未匹配人员数据量']
wb2.remove(remove_sheet1)
wb2.remove(remove_sheet2)
wb2.save(new_sheet)
wb2 = xl.load_workbook(new_sheet)
sheet21 = wb2.create_sheet(gaikuang,0)
sheet22 = wb2.create_sheet('未匹配人员数据量')
result_head = ['XXX','人员','涉及数据量(G)','第二次涉及数据量(G)','任务额差']
result_list.insert(0,result_head)
result_for_num = 1
for i in result_list:
result_i = list(i)
if result_for_num != 1:
result_chae=i[2]-i[3]-i[2]*0.2
result_i.append(float('%.2f'% result_chae))
result_for_num = result_for_num + 1
sheet21.append(result_i)
not_match_head = ['XXX编号','数据量(G)','第二次数据量(G)','任务额差']
not_match_list.insert(0,not_match_head)
not_match_for_num = 1
for j in not_match_list:
not_match_j = list(j)
if not_match_for_num != 1:
not_match_chae=j[1]-j[2]-j[1]*0.2
not_match_j.append(float('%.2f'% not_match_chae))
not_match_for_num = not_match_for_num + 1
sheet22.append(not_match_j)
wb2.save(new_sheet)
if __name__ == '__main__':
old_sheet='TJ-XJ-DATASHOW-' + time_year_month_day + '-old.xlsx'
new_sheet='TJ-XJ-DATASHOW-' + time_year_month_day + '.xlsx'
gaikuang='XXX概况'
add_last_excel(old_sheet=old_sheet,new_sheet=new_sheet,gaikuang=gaikuang)