Python Examples (Continuously Updated)




1. Data filtering: selecting records whose column contains specific values

1.1 DPI data

# Filter the app/ip indicator/label data, keeping only records for key cities
import pandas as pd
from pathlib import Path
import sys
import os
import chardet
import datetime
from dateutil.relativedelta import relativedelta

def turn(file):
    with open(file, 'rb') as f:
        data = f.read()
        encoding = chardet.detect(data)['encoding']
        data_str = data.decode(encoding)
        tp = 'LF'
        if '\r\n' in data_str:
            tp = 'CRLF'
            data_str = data_str.replace('\r\n', '\n')
        if encoding not in ['utf-8', 'ascii'] or tp == 'CRLF':
            with open(file, 'w', newline='\n', encoding='utf-8') as f:
                f.write(data_str)
            print(f"{file}: ({tp},{encoding}) trun to (LF,utf-8) success!")


citys = ["北京市","广州市","上海市","天津市","重庆市","沈阳市","南京市","武汉市",
        "成都市","西安市","石家庄市","太原市","郑州市","长春市","哈尔滨市","呼和浩特市","济南市","合肥市","杭州市",
        "福州市","长沙市","南宁市","南昌市","贵阳市","昆明市","拉萨市","海口市","兰州市","银川市","西宁市","乌鲁木齐市",
        "深圳市","苏州市","东莞市","宁波市","青岛市","温州市","佛山市","无锡市","金华市","泉州市","大连市","厦门市","台州市"]
address = '|'.join(citys)
day6 = (datetime.datetime.now() + relativedelta(days=-6)).strftime("%Y%m%d")  # six days ago, YYYYMMDD
print(day6)
for appType in ['bad_app_mark','bad_app']:
    inputApp = '/data/mytest/indicator/collection/dpi/sysk_test/' + day6 + '/' + appType
    for p in Path(inputApp).iterdir():
        for s in p.rglob('*.csv'):
            # print(s)
            df = pd.read_csv(s, header=None, index_col=False, names=['date','prov','city','big','small','a','b','c','d','e','f','g','h','i','j','k','l','m'], sep='|')
            df_new = df[df['city'].str.contains(address, na=False)]  # na=False guards against empty city fields
            df_new.to_csv(s, index=False, header=False, sep='|')
            turn(s)
print("app done")

for ipType in ['bad_ip_mark','bad_ip']:
    inputIp = '/data/mytest/indicator/collection/dpi/sysk_test/' + day6 + '/' + ipType
    for p in Path(inputIp).iterdir():
        for s in p.rglob('*.csv'):
            # print(s)
            df = pd.read_csv(s, header=None, index_col=False, names=['date','prov','city','big','small','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o'], sep='|')
            df_new = df[df['city'].str.contains(address, na=False)]  # na=False guards against empty city fields
            df_new.to_csv(s, index=False, header=False, sep='|')
            turn(s)
print("done")

1.2 Intermediate table data

# Recompute business-library in/out records: keep only Beijing/Tianjin/Shanghai rows from the intermediate and in/out tables

import pandas as pd
import chardet

def turn(file):
    with open(file, 'rb') as f:
        data = f.read()
        encoding = chardet.detect(data)['encoding']
        data_str = data.decode(encoding)
        tp = 'LF'
        if '\r\n' in data_str:
            tp = 'CRLF'
            data_str = data_str.replace('\r\n', '\n')
        if encoding not in ['utf-8', 'ascii'] or tp == 'CRLF':
            with open(file, 'w', newline='\n', encoding='utf-8') as f:
                f.write(data_str)
            print(f"{file}: ({tp},{encoding}) trun to (LF,utf-8) success!")


pd.set_option('expand_frame_repr', False)

middle = "D:\\cmdi\\sysk\\data_filter\\20220330\\intermediate_data\\partition=质差业务库\\part-00000-317d2028-fe98-423e-a639-dbc1853b0807.c000.txt"
middle_out = "C:\\Users\\Dell\\Desktop\\20220330_new\\middle\\part-00000-317d2028-fe98-423e-a639-dbc1853b0807.c000.txt"

middle_app = "D:\\cmdi\\sysk\\data_filter\\20220330\\intermediate_data_business\\app\\part-00000-8c683085-b6dd-42f9-8073-8cf9f9387852.c000.txt"
middle_out_app = "C:\\Users\\Dell\\Desktop\\20220330_new\\middle_app\\part-00000-8c683085-b6dd-42f9-8073-8cf9f9387852.c000.txt"

middle_ip = "D:\\cmdi\\sysk\\data_filter\\20220330\\intermediate_data_business\\ip\\part-00000-8c683085-b6dd-42f9-8073-8cf9f9387852.c000.txt"
middle_out_ip = "C:\\Users\\Dell\\Desktop\\20220330_new\\middle_ip\\part-00000-8c683085-b6dd-42f9-8073-8cf9f9387852.c000.txt"

outin = "D:\\cmdi\\sysk\\data_filter\\20220330\\outin_data\\partition=质差业务库\\part-00000-9c9cdf0f-cf95-4703-ad54-6f0b7ffe667e.c000.txt"
outin_out = "C:\\Users\\Dell\\Desktop\\20220330_new\\outin\\part-00000-9c9cdf0f-cf95-4703-ad54-6f0b7ffe667e.c000.txt"

outin_app = "D:\\cmdi\\sysk\\data_filter\\20220330\\outin_data_business\\app\\part-00000-b50d8aa8-b04d-4da5-ae34-f8cf9d2cb760.c000.txt"
outin_out_app = "C:\\Users\\Dell\\Desktop\\20220330_new\\outin_app\\part-00000-b50d8aa8-b04d-4da5-ae34-f8cf9d2cb760.c000.txt"

outin_ip = "D:\\cmdi\\sysk\\data_filter\\20220330\\outin_data_business\\ip\\part-00000-b50d8aa8-b04d-4da5-ae34-f8cf9d2cb760.c000.txt"
outin_out_ip = "C:\\Users\\Dell\\Desktop\\20220330_new\\outin_ip\\part-00000-b50d8aa8-b04d-4da5-ae34-f8cf9d2cb760.c000.txt"





# OUTINTYPE|KEY|province|city|BUSINESSTYPE|in-library time|latest in-library time|out time|in library?|long-term in library?|frequent in/out?|star|ARPU|indicator
df = pd.read_csv(middle_ip, header=None, index_col=False, sep="|", names=["type","key","prov","city","business","inTime","newinTime","outTime","in","longIn","frequentIn","star","arpu","indicator"])

# OUTINTYPE|KEY|ADDR|TYPE|IPADDR|PORT|province|city|STAR|ARPU|BUSINESSTYPE|in-library time|latest in-library time|out time|in library?|long-term in library?|frequent in/out?|indicator
# df = pd.read_csv(outin, header=None, index_col=False, sep="|", names=["outintype","key","addr","type","ip","port","prov","city","star","arpu","businessType","inTime","newinTime","outTime","in","longIn","frequentIn","indicator"])
s = middle_out_ip


shDf = df[df['prov'] == '上海']
bjDf = df[df['prov'] == '北京']
tjDf = df[df['prov'] == '天津']
print(len(shDf))
print(len(bjDf))
print(len(tjDf))
resDf = pd.concat([shDf, bjDf, tjDf], ignore_index=True)


resDf.to_csv(s, index=False, header=False, sep='|')
turn(s)


print(len(resDf))
print(resDf.head())
print("done")

2. Data processing: zero-padding a column

2.1 Intermediate table data

# Zero-pad the business major-category field in the intermediate table

import pandas as pd
import numpy as np
from pathlib import Path
import sys
import os
import chardet
def turn(file):
    with open(file, 'rb') as f:
        data = f.read()
        encoding = chardet.detect(data)['encoding']
        data_str = data.decode(encoding)
        tp = 'LF'
        if '\r\n' in data_str:
            tp = 'CRLF'
            data_str = data_str.replace('\r\n', '\n')
        if encoding not in ['utf-8', 'ascii'] or tp == 'CRLF':
            with open(file, 'w', newline='\n', encoding='utf-8') as f:
                f.write(data_str)
            print(f"{file}: ({tp},{encoding}) trun to (LF,utf-8) success!")

pd.set_option('expand_frame_repr', False)

# inputPath = "C:\\Users\\Dell\\Desktop\\intermediate_data\\20220225\\partition=质差业务库\\part-00000-c51b5c05-6be8-4996-917f-8b25de278967.c000.txt"
# inputPath = "C:\\Users\\Dell\\Desktop\\intermediate_data_business\\20220225\\partition=移动上网-BAD_APP\\part-00000-911a4850-4699-44ea-a4f4-5d6697582c13.c000.txt"
inputPath = "C:\\Users\\Dell\\Desktop\\intermediate_data_business\\20220225\\partition=移动上网-BAD_IP\\part-00000-911a4850-4699-44ea-a4f4-5d6697582c13.c000.txt"

df = pd.read_csv(inputPath,header=None,index_col=False,names = ['business','app','prov','city','type','a','b','c','d','e','f','g','h','i'],sep = '|')
# split the app field on '_'
business_name = ['one','two','three','four','five','six']
business_col = df['app'].str.split('_', expand=True)
business_col.columns = business_name
df = df.join(business_col)

# zero-pad the last field to width 5
df['six'] = df['six'].str.zfill(5)

# reassemble; the original app format is '_<two>_<three>__<five>_<six>', so fields one and four are empty
df['app'] = "_"+df['two'].map(str)+"_"+df['three'].map(str)+"__"+df['five'].map(str)+"_"+df['six'].map(str)
# drop the helper columns
df.drop(columns=business_name, inplace=True)

print(df.head())


outputPath = "./output/ip_part-00000-911a4850-4699-44ea-a4f4-5d6697582c13.c000.txt"
df.to_csv(outputPath, index=False, header=False, sep='|')
turn(outputPath)
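
An alternative sketch that pads the trailing numeric field in place, skipping the split/join round-trip (same assumption that field six is the numeric tail):

df['app'] = df['app'].str.replace(r'(\d+)$', lambda m: m.group(1).zfill(5), regex=True)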

2.2 DPI data

# Zero-pad the business minor-category field in the app/ip data

import pandas as pd
from pathlib import Path
import sys
import os
import chardet
def turn(file):
    with open(file, 'rb') as f:
        data = f.read()
        encoding = chardet.detect(data)['encoding']
        data_str = data.decode(encoding)
        tp = 'LF'
        if '\r\n' in data_str:
            tp = 'CRLF'
            data_str = data_str.replace('\r\n', '\n')
        if encoding not in ['utf-8', 'ascii'] or tp == 'CRLF':
            with open(file, 'w', newline='\n', encoding='utf-8') as f:
                f.write(data_str)
            print(f"{file}: ({tp},{encoding}) trun to (LF,utf-8) success!")


day6='20220401'
for appType in ['bad_app_mark','bad_app']:
    inputApp = '/data/mytest/indicator/collection/dpi/sysk_test/' + day6 + '/' + appType
    for p in Path(inputApp).iterdir():
        for s in p.rglob('*.csv'):
            # print(s)
            df = pd.read_csv(s, header=None, index_col=False, names=['date','prov','city','big','small','a','b','c','d','e','f','g','h','i','j','k','l','m'], sep='|')
            df['small'] = df['small'].astype('str')
            df['small'] = df['small'].str.zfill(5)
            # print(df)
            df.to_csv(s, index=False, header=False, sep='|')
            turn(s)

for ipType in ['bad_ip_mark','bad_ip']:
    inputIp = '/data/mytest/indicator/collection/dpi/sysk_test/' + day6 + '/' + ipType
    for p in Path(inputIp).iterdir():
        for s in p.rglob('*.csv'):
            # print(s)
            df = pd.read_csv(s, header=None, index_col=False, names=['date','prov','city','big','small','a','b','c','d','e','f','g','h','i','j','k','l','m','n','o'], sep='|')
            df['small'] = df['small'].astype('str')
            df['small'] = df['small'].str.zfill(5)
            # print(df)
            df.to_csv(s, index=False, header=False, sep='|')
            turn(s)
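
A small design note: reading the column as text up front avoids the astype round-trip and removes any risk of pandas parsing away leading zeros. A sketch (column index 4 is 'small' in the app layout above):

df = pd.read_csv(s, header=None, index_col=False, sep='|', dtype={4: str})
df[4] = df[4].str.zfill(5)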

3. Data processing: compiling work-order dispatch and inspection statistics

# Compile work-order dispatch and quality-inspection statistics
from pathlib import Path
import pandas as pd
import json
import time

prov_dict = {551: '安徽', 100: '北京', 230: '重庆', 591: '福建', 200: '广东', 931: '甘肃', 771: '广西',
             851: '贵州', 371: '河南', 270: '湖北', 311: '河北', 898: '海南', 451: '黑龙江', 731: '湖南',
             431: '吉林', 250: '江苏', 791: '江西', 240: '辽宁', 471: '内蒙古', 951: '宁夏', 971: '青海',
             280: '四川', 531: '山东', 210: '上海', 290: '陕西', 351: '山西', 220: '天津', 991: '新疆',
             891: '西藏', 871: '云南', 571: '浙江'}
type_dict = {1: '用户', 2: '业务', 3: '位置', 4: '网元'}
# Collect work-order dispatch info from the attachment file names
order = []

for p in Path('/home/liuge/laizhengyang/calc_inspect/attach').iterdir():
    if p.name.startswith("附件1") and p.name.endswith("T1.csv"):
        qdWsid = p.name.split("_")[1]
        s = p.name.split("_")[1].split("-")
        del s[3]
        s.insert(0, qdWsid)
        order.append(s)

orderDF = pd.DataFrame(order, columns=['qdWsid','date','type','prov'])
orderDF['type'] = orderDF['type'].apply(pd.to_numeric)
orderDF['prov'] = orderDF['prov'].apply(pd.to_numeric)
orderDF['type'] = orderDF['type'].map(lambda x: type_dict[x])
orderDF['prov'] = orderDF['prov'].map(lambda x: prov_dict[x])

# Collect work-order inspection info
inspect = []
for p in Path('/home/liuge/laizhengyang/calc_inspect/inspect').iterdir():
    # load the JSON file
    with open(p, 'r', encoding='utf-8', errors='ignore') as f:
        rows = json.load(f)
        for row in rows:
            qdWsid = row['qdWsid']
            second = row.get('issueSolvedSecond') or '空'  # missing or empty -> '空'
            first = row['issueSolvedFirst']
            inspect.append([qdWsid, first, second])
print(inspect)
inspectDF = pd.DataFrame(inspect, columns=['qdWsid', '第一次质检', '第二次质检'])
inspectDF.loc[inspectDF['第一次质检'] == '是', '第二次质检'] = '归档'

# De-duplicate
# A '空' row is always followed by a second-inspection row, so it is treated as a duplicate?
# Note from the author: this logic is flawed; it also drops orders whose first inspection
# is complete but whose second inspection has not happened yet.
inspectDF.drop(inspectDF.index[(inspectDF['第二次质检'] == '空')], inplace=True)
inspectDF = inspectDF.drop_duplicates(['qdWsid'])
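# A safer variant (sketch; '待复检' is a hypothetical placeholder label): mark pending
# rows instead of dropping them, so not-yet-reinspected orders survive the left join:
# inspectDF.loc[inspectDF['第二次质检'] == '空', '第二次质检'] = '待复检'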
# left join on qdWsid
outputDF = pd.merge(orderDF,inspectDF,how="left")
today =  time.strftime("%Y%m%d", time.localtime())
writer = pd.ExcelWriter('/home/liuge/laizhengyang/calc_inspect/{}.xlsx'.format(today),engine='openpyxl')
outputDF.to_excel(writer, sheet_name='info',index=False)
orderDF.to_excel(writer, sheet_name='order',index=False)
inspectDF.to_excel(writer, sheet_name='inspect',index=False)
writer.close()  # closing the writer saves the workbook
print('Done')

4. Data processing: handling "four layers, six domains" monthly-granularity files

# Monthly "four layers, six domains" data fix-up: rename the file, map Chinese headers to English indicator codes, and add two columns

import pandas as pd
import chardet
import datetime
from dateutil.relativedelta import relativedelta

def turn(file):
    with open(file, 'rb') as f:
        data = f.read()
        encoding = chardet.detect(data)['encoding']
        data_str = data.decode(encoding)
        tp = 'LF'
        if '\r\n' in data_str:
            tp = 'CRLF'
            data_str = data_str.replace('\r\n', '\n')
        if encoding not in ['utf-8', 'ascii'] or tp == 'CRLF':
            with open(file, 'w', newline='\n', encoding='utf-8') as f:
                f.write(data_str)
            print(f"{file}: ({tp},{encoding}) trun to (LF,utf-8) success!")


pd.set_option('expand_frame_repr', False)

scly = {'省份': 'PROVINCE', '城市': 'CITY', '日期': 'DATE_TIME', '日期类型': 'DATE_TYPE',
        'VoLTE全程呼叫成功率': 'B05D03S004I01100',
        'VoLTE语音呼叫建立时延': 'B05D03S109I00300',
        'VoLTE语音质差通话占比': 'B05D03S004I01200',
        '5G语音回落接通率': 'B05D03S004I01300',
        '5G语音回落接通时延': 'B05D03S004I01400',
        '支付响应成功率': 'B05D03S116I00100',
        '支付业务响应总时延': 'B05D03S116I00300',
        '即时通信响应总时延': 'B05D03S136I00500',
        '即时通信消息发送成功率': 'B05D03S136I00300',
        '即时通信消息接收成功率': 'B05D03S136I00400',
        '视频播放成功率': 'B05D03S107I00400',
        '视频播放等待时长': 'B05D03S107I00500',
        '视频播放卡顿时长占比': 'B05D03S107I00600',
        '视频播放平均卡顿次数': 'B05D03S028I00100',
        '视频播放流畅度': 'B05D03S028I00200',
        '视频业务响应成功率': 'B18D13S005I00300',
        '视频业务响应总时延': 'B18D13S005I00900',
        '视频上行RTT时延': 'B18D13S005I00500',
        '视频下行RTT时延': 'B18D13S005I00600',
        '网页浏览成功率': 'B05D03S107I00200',
        '网页打开时长': 'B05D03S107I00300',
        '游戏响应成功率': 'B05D03S005I00700',
        '游戏响应总时延': 'B05D03S036I00300',
        '游戏加载时延': 'B05D03S036I00200',
        '游戏上行RTT时延': 'B18D13S005I00700',
        '游戏下行RTT时延': 'B18D13S005I00800',
        '5G消息受理成功率': 'B05D03S135I00300',
        '5G消息下发成功率': 'B05D03S135I00400',
        '家宽装机及时率': 'B06D04S031I00300',
        '家宽投诉处理及时率': 'B06D04S006I00200',
        'VoLTE语音网络接通率': 'B05D03S004I02000',
        'TCP上行重传率': 'B05D03S005I03300',
        'TCP下行重传率': 'B05D03S005I03400',
        'TCP上行乱序率': 'B05D03S005I03500',
        'TCP下行乱序率': 'B05D03S005I03600',
        'HTTP响应成功率': 'B05D03S005I02400',
        'HTTP响应时延': 'B05D03S005I02500',
        'SA排除用户原因的初始注册成功率': 'B05D03S005I02900',
        'AMF业务请求成功率': 'B05D03S082I00200',
        'ToBAMF用户鉴权成功率': 'B05D03S082I00500',
        'PDU会话建立成功率': 'B05D03S082I00300',
        '5G寻呼成功率': 'B05D03S082I00400',
        '5G流量分流比': 'B18D03S086I00100',
        }


# inputPath = "C:\\Users\\Dell\\Desktop\\ori\\scly-m-202201-001.csv"
month = datetime.date.strftime(datetime.date.today() - relativedelta(months=1), '%Y%m')
inputPath = '/data/mytest/indicator/collection/jzxn/scly-slqf/scly-m-'+month+'-001.csv'
outputPath = '/data/mytest/indicator/collection/ods_slsw/scly/slsw_scly_m_'+month+'.csv'


df = pd.read_csv(inputPath,index_col=False,sep = '|')
# map the Chinese headers to English indicator codes
df = df.rename(columns=scly)
# add the ATTRIBUTE columns
df.insert(4, 'ATTRIBUTE1', '汇总')
df.insert(5, 'ATTRIBUTE2', '汇总')
df.to_csv(outputPath, index=False, sep='|')
turn(outputPath)
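
DataFrame.rename silently ignores any header missing from the mapping, so a quick guard (a sketch) can flag upstream header drift:

unmapped = [c for c in df.columns if not c.isascii()]
if unmapped:
    print(f"warning: headers not mapped by scly: {unmapped}")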

5. Data processing: converting "four volumes, seven fees" daily files to monthly granularity

# Monthly "four volumes, seven fees" rollup: convert the daily files to monthly granularity; covers outlier handling and groupby usage

#!/usr/local/bin/python
# -*- coding:utf-8 -*-
import pandas as pd
from pathlib import Path
import chardet
import datetime
from dateutil.relativedelta import relativedelta

def turn(file):
    with open(file, 'rb') as f:
        data = f.read()
        encoding = chardet.detect(data)['encoding']
        data_str = data.decode(encoding)
        tp = 'LF'
        if '\r\n' in data_str:
            tp = 'CRLF'
            data_str = data_str.replace('\r\n', '\n')
        if encoding not in ['utf-8', 'ascii'] or tp == 'CRLF':
            with open(file, 'w', newline='\n', encoding='utf-8') as f:
                f.write(data_str)
            print(f"{file}: ({tp},{encoding}) trun to (LF,utf-8) success!")


indicator = ['B18D17S086I00180', 'B18D17S086I00280', 'B18D17S086I04080', 'B18D17S086I03280', 'B13D17S086I02480',
             'B13D17S086I02980', 'B18D17S086I00680', 'B18D17S086I00780', 'B18D17S086I04180', 'B18D17S086I00980',
             'B18D17S086I01080', 'B18D17S086I03380', 'B18D17S086I03480', 'B18D17S088I01100', 'B18D17S088I01200']

# 'B18D17S086I00380', 'B18D17S086I00880'

pd.set_option('expand_frame_repr', False)

# month = datetime.date.strftime(datetime.date.today() - relativedelta(months=1), '%Y%m')
month = '202202'

df = pd.DataFrame()
# for p in Path('/data/mytest/indicator/collection/jzxn/scly-slqf').iterdir():
for p in Path("C:\\Users\\Dell\\Desktop\\需求梳理\\ori\\slqf").iterdir():
    if month in p.name and "slqf" in p.name:
        inputPath = p
        df_tmp = pd.read_csv(inputPath, index_col=False, sep='|')
        sc = df_tmp.drop(df_tmp.index[[0, 1]])  # drop the first two rows of each daily file
        df = pd.concat([df, sc], ignore_index=True)  # accumulate the daily frames
# df.to_csv('/data/mytest/indicator/collection/ods_slsw/slqf_collect/slqf_'+month+'_combine.csv', index=0, sep='|')
# df.to_csv("C:\\Users\\Dell\\Desktop\\original_data_"+month+".csv", index=0, sep='|')


df[indicator] = df[indicator].astype(float)
df = df.reset_index(drop=True)


B18D17S086I00180 = (df['B18D17S086I00180'].groupby(df['CITY']).mean()/1024/1024).round(6)

# take every city's value on the day this indicator peaks within the month
decideTime = df[df["B18D17S086I00280"] == df["B18D17S086I00280"].max()]['DATE_TIME']
timeStr = decideTime.values[0]
B18D17S086I00280 = df[df.DATE_TIME == timeStr][['CITY', 'B18D17S086I00280']]
B18D17S086I00280 = (pd.DataFrame(B18D17S086I00280).set_index('CITY')/1024/1024).round(6)

B18D17S086I03280 = (df['B18D17S086I03280'].groupby(df['CITY']).mean()/1024).round(6)
B13D17S086I02480 = df['B13D17S086I02480'].groupby(df['CITY']).mean().round(6)
B13D17S086I02980 = df['B13D17S086I02980'].groupby(df['CITY']).mean().round(6)
B18D17S086I00680 = (df['B18D17S086I00680'].groupby(df['CITY']).mean()/10000).round(6)
# B18D17S086I00780 = (df['B18D17S086I00780'].groupby(df['CITY']).max()/10000).round(6)

# Outlier handling for B18D17S086I00780: walk down from the maximum, discarding the top
# value while it exceeds the runner-up by 5% or more, then keep the surviving maximum.
accDf = df.sort_values(by='B18D17S086I00780', ascending=True)
accDf = accDf.reset_index(drop=True)
accLen = len(accDf)
while accLen > 1:  # stop before accLen-2 underflows
    maxVal = accDf['B18D17S086I00780'][accLen-1]
    secondMaxVal = accDf['B18D17S086I00780'][accLen-2]
    ratio = (maxVal-secondMaxVal)/maxVal
    if ratio < 0.05:
        break
    accLen = accLen-1
decideTime780 = accDf[accDf['B18D17S086I00780'] == accDf['B18D17S086I00780'][accLen-1]]['DATE_TIME']
# decideTime780 = df[df["B18D17S086I00780"] == df["B18D17S086I00780"].max()]['DATE_TIME']
timeStr780 = decideTime780.values[0]
B18D17S086I00780 = df[df.DATE_TIME == timeStr780][['CITY', 'B18D17S086I00780']]
B18D17S086I00780 = (pd.DataFrame(B18D17S086I00780).set_index('CITY')/10000).round(6)



B18D17S086I00980 = (df['B18D17S086I00980'].groupby(df['CITY']).mean()/10000).round(6)
B18D17S086I01080 = (df['B18D17S086I01080'].groupby(df['CITY']).mean()/10000).round(6)
B18D17S086I03380 = df['B18D17S086I03380'].groupby(df['CITY']).mean().round(6)
B18D17S086I03480 = df['B18D17S086I03480'].groupby(df['CITY']).mean().round(6)
B18D17S088I01100 = (df['B18D17S088I01100'].groupby(df['CITY']).mean()/10000).round(6)
B18D17S088I01200 = (df['B18D17S088I01200'].groupby(df['CITY']).mean()/10000).round(6)
# B18D17S086I00380 = df['B18D17S086I00380'].groupby(df['CITY']).mean().round(6)
# B18D17S086I00880 = (df['B18D17S086I00880'].groupby(df['CITY']).mean()/10000).round(6)


sc = pd.concat([B18D17S086I00180, B18D17S086I00280, B18D17S086I03280, B13D17S086I02480,
                B13D17S086I02980, B18D17S086I00680, B18D17S086I00780, B18D17S086I00980,
                B18D17S086I01080, B18D17S086I03380, B18D17S086I03480, B18D17S088I01100,
                B18D17S088I01200], axis=1, sort=False)
# , B18D17S086I00380, B18D17S086I00880

# rebuild the index
sc = sc.reset_index(drop=False)
# add a PROVINCE column (author's open question: if CITY holds a prefecture-level city,
# what should PROVINCE be?)
# print(sc)
sc.insert(0, 'PROVINCE', sc['CITY'])
# add the DATE_TIME column
sc.insert(2, 'DATE_TIME', month)
# add the DATE_TYPE column
sc.insert(3, 'DATE_TYPE', '月')
# add ATTRIBUTE1 and ATTRIBUTE2
sc.insert(4, 'ATTRIBUTE1', '汇总')
sc.insert(5, 'ATTRIBUTE2', '汇总')

# outputPath = '/data/mytest/indicator/collection/ods_slsw/slqf/slsw_slqf_m_'+month+'_001_001.csv'
outputPath = 'C:\\Users\\Dell\\Desktop\\slsw_slqf_m_'+month+'_001_001.csv'
sc.to_csv(outputPath, index=False, sep='|')
turn(outputPath)
print("done")

6. Data processing: computing the TOP5 long-term in-library records by business volume

DPI data + intermediate table data

# For low-quality APP / IP / POI, pick the TOP5 long-term in-library records by business volume
import pandas as pd
import os
import numpy as np
import datetime
from dateutil.relativedelta import relativedelta

month = datetime.date.strftime(datetime.date.today() - relativedelta(months=1), '%Y%m')   # previous month, YYYYMM format
#month = 202110
sysk_date = str(month)[4:] + '30'
#sysk_date = '1030'

dpi_path = 'D:\\TOP5\\DPI'
sysk_path = 'D:\\TOP5\\四域四库'

def SearchFiles(path, fileType):
    fileList=[]
    # os.walk yields (root, dirs, files): directory path, subdirectory names, file names
    for root, dirs, files in os.walk(path):
        for fileName in files:
            if fileName.endswith(fileType):
                fileList.append(os.path.join(root,fileName))
    return fileList

# prov_dict = {551: '安徽', 100: '北京', 230: '重庆', 591: '福建', 200: '广东', 931: '甘肃', 771: '广西',
#              851: '贵州', 371: '河南', 270: '湖北', 311: '河北', 898: '海南', 451: '黑龙江', 731: '湖南',
#              431: '吉林', 250: '江苏', 791: '江西', 240: '辽宁', 471: '内蒙古', 951: '宁夏', 971: '青海',
#              280: '四川', 531: '山东', 210: '上海', 290: '陕西', 351: '山西', 220: '天津', 991: '新疆',
#              891: '西藏', 871: '云南', 571: '浙江'}




# DPI APP data ----------------------------------------------------------------------------------------
app = pd.read_csv(open('{}\\{}\\质差APP-{}.log'.format(dpi_path,month,month),encoding='UTF8',errors='ignore')
                  ,sep='|',usecols=[1,3,4,5,16,17],header=None,low_memory=False,dtype=object)   # force string dtype so leading zeros survive
# columns read: province, major-category code, minor-category code, user count, category names
app.columns=['prov','type','subtype','cnt','大类名称','小类名称']
app['cnt'] = app['cnt'].apply(pd.to_numeric)  # cnt to numeric
app['app_type'] = app['type'] + '_' + app['subtype']   # concatenate the two code columns
app_cnt = app.groupby(by = ['prov','app_type','大类名称','小类名称']).aggregate({'cnt':np.sum})
app_cnt.reset_index(inplace=True)


sysk_path_app = '{}\\四域四库_{}\\app'.format(sysk_path, sysk_date)
fileType ='.txt'
fList = SearchFiles(sysk_path_app, fileType)
frames = [pd.read_csv(open(file, encoding='UTF8', errors='ignore'), sep='|', header=None,
                      usecols=[1, 6, 15], low_memory=False, dtype=object) for file in fList]
sysk_app = pd.concat(frames, ignore_index=True)
# columns read: major/minor category code, province, long-term in-library flag
sysk_app.columns=['userid','prov','is_long']
sysk_app_long = sysk_app[sysk_app['is_long'] == '是']


# Join DPI volumes onto the long-term in-library records
app_output = pd.merge(sysk_app_long,app_cnt,left_on=['prov','userid'],right_on=['prov','app_type'], how='left')
app_output.drop(['userid','is_long'], axis=1,inplace=True)

app_output.drop_duplicates(inplace=True)
app_output['rank'] = app_output.groupby(['prov'])['cnt'].rank(method='min', ascending=False)  # rank within each province, largest first
app_output.sort_values(['prov','rank'], ascending=[True, True], inplace=True)
app_output = app_output[app_output['rank']<=5]
app_order = ['prov','app_type','cnt','大类名称','小类名称','rank']
app_output = app_output[app_order]


# DPI IP data -------------------------------------------------------------------------------------------
ip = pd.read_csv(open('{}\\{}\\质差IP-{}.log'.format(dpi_path,month,month),encoding='UTF8',errors='ignore')
                  ,sep='|',usecols=[1,3,4,5,6,17,18,19],header=None,low_memory=False,dtype=object)   # force string dtype so leading zeros survive
# columns read: province, major/minor category codes, IP, user count, category names, home location
ip.columns=['prov','type','subtype','ip','cnt','大类名称','小类名称','归属地']
ip['cnt'] = ip['cnt'].apply(pd.to_numeric)  # cnt to numeric
ip['app_type'] = ip['type'] + '_' + ip['subtype']   # concatenate the two code columns
ip_cnt = ip.groupby(by = ['prov','app_type','ip','归属地','大类名称','小类名称']).aggregate({'cnt':np.sum})
ip_cnt.reset_index(inplace=True)


sysk_path_ip = '{}\\四域四库_{}\\ip'.format(sysk_path,sysk_date)
fileType ='.txt'
fList = SearchFiles(sysk_path_ip, fileType)
#print(fList)
frames = [pd.read_csv(open(file, encoding='UTF8', errors='ignore'), sep='|', header=None,
                      usecols=[1, 4, 6, 15], low_memory=False, dtype=object) for file in fList]
sysk_ip = pd.concat(frames, ignore_index=True)
# columns read: major/minor category code, IP, province, long-term in-library flag
sysk_ip.columns=['userid','ip','prov','is_long']
sysk_ip_long = sysk_ip[sysk_ip['is_long'] == '是']

# Join DPI volumes onto the long-term in-library records
ip_output = pd.merge(sysk_ip_long,ip_cnt,left_on=['prov','userid','ip'],right_on=['prov','app_type','ip'], how='left')
ip_output.drop(['userid','is_long'], axis=1,inplace=True)

ip_output.drop_duplicates(inplace=True)
ip_output['rank'] = ip_output.groupby(['prov'])['cnt'].rank(method='min', ascending=False)  # rank within each province, largest first
ip_output.sort_values(['prov','rank'], ascending=[True, True], inplace=True)
ip_output = ip_output[ip_output['rank']<=5]
order = ['prov','app_type','ip','cnt','归属地','大类名称','小类名称','rank']
ip_output = ip_output[order]


# DPI POI data -----------------------------------------------------------------------------------------
poi = pd.read_csv(open('{}\\{}\\质差POI-{}.log'.format(dpi_path,month,month),encoding='UTF8',errors='ignore')
                  ,sep='|',usecols=[1,3,4,6],header=None,low_memory=False,dtype=object)   # force string dtype so leading zeros survive
poi.columns=['prov','longitude','latitude','cnt']
poi['cnt'] = poi['cnt'].apply(pd.to_numeric)  # cnt to numeric
poi['poi'] = poi['longitude'] + '_' + poi['latitude']   # concatenate into a poi key
poi_cnt = poi.groupby(by = ['prov','poi']).aggregate({'cnt':np.sum})
poi_cnt.reset_index(inplace=True)

sysk_path_poi = '{}\\四域四库_{}\\poi'.format(sysk_path,sysk_date)
fileType ='.txt'
fList = SearchFiles(sysk_path_poi, fileType)
#print(fList)
frames = [pd.read_csv(open(file, encoding='UTF8', errors='ignore'), sep='|', header=None,
                      usecols=[1, 6, 15], low_memory=False, dtype=object) for file in fList]
sysk_poi = pd.concat(frames, ignore_index=True)
sysk_poi.columns=['userid','prov','is_long']
sysk_poi_long = sysk_poi[sysk_poi['is_long'] == '是']

# Join DPI volumes onto the long-term in-library records
poi_output = pd.merge(sysk_poi_long,poi_cnt,left_on=['prov','userid'],right_on=['prov','poi'], how='left')
poi_output.drop(['userid','is_long'], axis=1,inplace=True)
poi_output['rank'] = poi_output.groupby(['prov'])['cnt'].rank(method='min', ascending=False)  # rank within each province, largest first
poi_output.sort_values(['prov','rank'], ascending=[True, True], inplace=True)
poi_output = poi_output[poi_output['rank']<=5]
#print(poi_output.head(20))

writer = pd.ExcelWriter('{}\\{}\\质差top5_{}.xlsx'.format(dpi_path,month,month),engine='openpyxl')
app_output.to_excel(writer, sheet_name='app',index=False)
ip_output.to_excel(writer, sheet_name='ip',index=False)
poi_output.to_excel(writer, sheet_name='poi',index=False)
writer.close()  # closing the writer saves the workbook
print('Done')
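
One rank() subtlety worth knowing: method='min' gives tied counts the same (smallest) rank, so "rank <= 5" can return more than five rows per province. A tiny demo:

demo = pd.DataFrame({'prov': ['A', 'A', 'A'], 'cnt': [9, 9, 1]})
demo['rank'] = demo.groupby('prov')['cnt'].rank(method='min', ascending=False)
print(demo['rank'].tolist())  # [1.0, 1.0, 3.0] -- both 9s rank first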

7. Data processing: work-order generation and attachment migration

# Rename attachments 1/2, move them to the attachment directory, and generate the work orders


import datetime
import json
import time
import os, shutil, sys

# Current date/time (live versions commented out; fixed values used below)
# file_today = datetime.date.strftime(datetime.date.today(), '%Y%m%d')
# file_min = time.strftime("%Y%m%d%H%M", time.localtime())
# file_sec = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
file_today = "20220519"
file_min = "202205191030"
file_sec = "2022-05-19 10:30:00"

data_month = '202204'
# Province mappings
prov_dict = {"安徽": 551, "北京": 100, "重庆": 230, "福建": 591, "广东": 200, "甘肃": 931, "广西": 771,
             "贵州": 851, "河南": 371, "湖北": 270,"河北": 311, "海南": 898, "黑龙江": 451, "湖南": 731,
             "吉林": 431, "江苏": 250, "江西": 791, "辽宁": 240, "内蒙古": 471,"宁夏": 951, "青海": 971,
             "四川": 280, "山东": 531, "上海": 210, "陕西": 290, "山西": 351, "天津": 220, "新疆": 991,
             "西藏": 891, "云南": 871, "浙江": 571}
prov_code_dict = {"广东": "GD", "北京": "BJ", "上海": "SH", "天津": "TJ", "江苏": "JS", "浙江": "ZJ",
                  "安徽": "AH", "福建": "FJ", "湖北": "HB", "陕西": "SN", "河北": "HE", "山西": "SX",
                  "河南": "HA", "吉林": "JL", "湖南": "HN", "广西": "GX", "江西": "JX", "云南": "YN",
                  "海南": "HI", "甘肃": "GS", "青海": "QH", "重庆": "CQ", "辽宁": "LN", "四川": "SC",
                  "山东": "SD", "贵州": "GZ", "西藏": "XZ", "宁夏": "NX", "新疆": "XJ", "黑龙江": "HL",
                  "内蒙古": "NM"}


# Attachment handling
# move every file under cur_path up into root_path, then drop the emptied folders
def move_to_root_folder(root_path, cur_path):
    for filename in os.listdir(cur_path):
        if os.path.isfile(os.path.join(cur_path, filename)):
            shutil.move(os.path.join(cur_path, filename), os.path.join(root_path, filename))
        elif os.path.isdir(os.path.join(cur_path, filename)):
            move_to_root_folder(root_path, os.path.join(cur_path, filename))
        else:
            sys.exit("Should never reach here.")

    # remove empty folders
    if cur_path != root_path:
        os.rmdir(cur_path)

# Build attachment 1
key_word = 'no5g_eci_month'
ori_path = 'C:\\Users\\Dell\\Desktop\\'+key_word+'\\'+data_month+'\\userList\\'
# list the province directories
prov_path = os.listdir(ori_path)
for prov in prov_path:
    if "province" in prov:
        provName = prov[9:]  # directory names look like 'province=<name>'
        provCode = prov_dict[provName]
        desPath = ori_path + '\\' + prov  # path to the old file
        oldFile = os.listdir(desPath)[0]
        oldName = os.path.join(desPath, oldFile)
        # TODO: check the file size and auto-split into multiple files numbered 0001..000n
        newFile = "附件1_"+file_today+"-05-"+str(provCode)+"-0001_"+key_word+".csv"
        newName = os.path.join(desPath, newFile)  # renamed file
        os.rename(oldName, newName)

# Build attachment 2
key_word2 = 'indoor_quality'
ori_path2 = 'C:\\Users\\Dell\\Desktop\\'+key_word2+'\\'+data_month+'\\userList\\'
# list the province directories
prov_path2 = os.listdir(ori_path2)
for prov2 in prov_path2:
    if "province" in prov2:
        provName2 = prov2[9:]  # directory names look like 'province=<name>'
        provCode2 = prov_dict[provName2]
        desPath2 = ori_path2 + '\\' + prov2  # path to the old file
        oldFile2 = os.listdir(desPath2)[0]
        oldName2 = os.path.join(desPath2, oldFile2)
        # TODO: check the file size and auto-split into multiple files numbered 0001..000n
        newFile2 = "附件2_"+file_today+"-05-"+str(provCode2)+"-0001_"+key_word2+".csv"
        newName2 = os.path.join(desPath2, newFile2)  # renamed file
        os.rename(oldName2, newName2)

des_path1 = 'C:\\Users\\Dell\\Desktop\\'+data_month+'\\attachmentone'
move_to_root_folder(des_path1, ori_path)
des_path2 = 'C:\\Users\\Dell\\Desktop\\'+data_month+'\\attachmenttwo'
move_to_root_folder(des_path2, ori_path2)
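
The two rename blocks above are identical up to the keyword and attachment number; a parameterized sketch (rename_attachments is a hypothetical helper, with the same assumptions about the directory layout) that both could share:

def rename_attachments(key_word, attach_no):
    ori = 'C:\\Users\\Dell\\Desktop\\' + key_word + '\\' + data_month + '\\userList\\'
    for prov in os.listdir(ori):
        if "province" in prov:
            code = prov_dict[prov[9:]]          # 'province=<name>' -> numeric code
            des = os.path.join(ori, prov)
            old = os.listdir(des)[0]
            new = "附件{}_{}-05-{}-0001_{}.csv".format(attach_no, file_today, code, key_word)
            os.rename(os.path.join(des, old), os.path.join(des, new))
    return ori

# usage: ori_path = rename_attachments('no5g_eci_month', 1)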


# Build the work orders
province = ["安徽", "北京", "重庆", "福建", "广东", "甘肃", "广西", "贵州", "河南",
            "湖北", "河北", "海南", "黑龙江", "湖南", "吉林", "江苏", "江西", "辽宁",
            "内蒙古", "宁夏", "青海", "四川", "山东", "上海", "陕西", "山西", "天津",
            "新疆", "西藏", "云南", "浙江"]
order_combine = []
for i in province:
    provCode = prov_code_dict[i]
    provNum = prov_dict[i]
    orderId = file_today + '-05-' + str(provNum) + '-0001'
    attach1_word = "no5g_eci_month"
    attach2_word = "indoor_quality"
    attach1 = "附件1_" + file_today + "-05-" + str(provNum) + "-0001_" + attach1_word + ".csv"
    attach2 = "附件2_" + file_today + "-05-" + str(provNum) + "-0001_" + attach2_word + ".csv"
    order = {
        "createUsername": "张思为",
        "createMobile": "13811784627;zhangsiwei@chinamobile.com",
        "qdWsid": orderId,
        "decideTime": file_sec,
        "provinceCode": provCode,
        "provinceName": i,
        "qdRegion": "",
        "qdType": "分析数据",
        "qdMajor": "移动业务",
        "wsHintInfo": "移动业务-分析数据工单-转质量管理专业",
        "relatedCounts": "",
        "relatedValuableCounts": "",
        "taskLevel": "一般",
        "factoryHandleLimitTime1": "14",
        "decideRule": "具体分析要求,总部将与各省质量管理专业同事沟通,本工单仅派发分析数据,不作具体修复。",
        "repairRule": "核心规则见附件,请各省参考。请将本工单转发省内质量管理负责同事。",
        "topCause": "",
        "relatedImportantCounts": "",
        "attachList1": [
         {
             "fileName": attach1,
             "fileUrl": "/ftpdata/user/eoms/attachments/"
         },
         {
             "fileName": attach2,
             "fileUrl": "/ftpdata/user/eoms/attachments/"
         },
         {
            "fileName": "数据统计规则.docx",
            "fileUrl": "/ftpdata/user/eoms/attachments/"
         }
        ]
    }
    order_combine.append(order)
order_name = 'sysk_workorder_'+file_min+'.txt'
order_path = 'C:\\Users\\Dell\\Desktop\\'+order_name
with open(order_path, 'w', encoding='utf-8') as f:  # explicit utf-8; the Windows locale default may differ
    json.dump(order_combine, f, ensure_ascii=False, indent=None)
print("done")

8. Data processing: splitting a file by row count while keeping the header

# Split an attachment by row count, keeping the header row in every part
import pandas as pd
import os
import shutil
import chardet

def turn(file):
    with open(file, 'rb') as f:
        data = f.read()
        encoding = chardet.detect(data)['encoding']
        data_str = data.decode(encoding)
        tp = 'LF'
        if '\r\n' in data_str:
            tp = 'CRLF'
            data_str = data_str.replace('\r\n', '\n')
        if encoding not in ['utf-8', 'ascii'] or tp == 'CRLF':
            with open(file, 'w', newline='\n', encoding='utf-8') as f:
                f.write(data_str)
            print(f"{file}: ({tp},{encoding}) trun to (LF,utf-8) success!")

def SplitExcel(file, num):
    file_dir = 'C:\\Users\\Dell\\Desktop\\result'  # output directory
    if os.path.isdir(file_dir):
        shutil.rmtree(file_dir)  # os.rmdir fails on non-empty directories
    os.mkdir(file_dir)  # (re)create the output directory
    n = 1
    df = pd.read_csv(file)
    row_num = df.shape[0]  # total row count
    if num >= row_num:  # the chunk size must be smaller than the total
        raise Exception('num exceeds the total row count!')
    row_list = list(range(num, row_num, num))
    row_list.append(row_num)  # chunk boundaries, ending at the last row

    name = os.path.splitext(os.path.basename(file))[0]  # file name without directory or extension

    for m in row_list:
        filename = os.path.join(file_dir, name + '_' + str(n) + '.csv')
        if m < row_num:
            df_handle = df.iloc[m - num:m]  # a full chunk of num rows
            print(df_handle)
            df_handle.to_csv(filename, index=False)
            turn(filename)
        elif m == row_num:
            remainder = row_num % num  # leftover rows
            if remainder == 0:
                remainder = num  # evenly divisible: the final chunk is a full chunk
            df_handle = df.iloc[m - remainder:m]  # trailing rows
            df_handle.to_csv(filename, index=False)
            turn(filename)
        n = n + 1

prov_dict = {"安徽": 551, "北京": 100, "重庆": 230, "福建": 591, "广东": 200, "甘肃": 931, "广西": 771,
             "贵州": 851, "河南": 371, "湖北": 270,"河北": 311, "海南": 898, "黑龙江": 451, "湖南": 731,
             "吉林": 431, "江苏": 250, "江西": 791, "辽宁": 240, "内蒙古": 471,"宁夏": 951, "青海": 971,
             "四川": 280, "山东": 531, "上海": 210, "陕西": 290, "山西": 351, "天津": 220, "新疆": 991,
             "西藏": 891, "云南": 871, "浙江": 571}
if __name__ == '__main__':
    province = "湖北"
    provCode = prov_dict[province]
    fileName = "C:\\Users\\Dell\\Desktop\\202205\\attachmentone\\附件1_20220519-05-" + str(provCode) + "-0001_no5g_eci_month.csv"
    # file = 'result.xls'
    SplitExcel(fileName, num=730000)
    print("done")

Source: https://www.cnblogs.com/leo-lai/p/16288359.html