Fix#347 add resource usage page #358

Open

wants to merge 23 commits into base: dev

Commits (23)
e8172e5
Merge branch 'fix#336_complete_pod_detail' into fix#347_add_resource_…
Dec 12, 2016
c860849
Mostly finished the backend for computing resource usage
Dec 13, 2016
2565b0b
Mostly finished the frontend resource usage page
Dec 14, 2016
42dddde
Minor style tweaks to the frontend page
Dec 14, 2016
70097b0
Fix the data shown on the resource usage page (it previously showed the metrics' usage values, but it should show request)
Dec 19, 2016
00f359f
Fix a bug where integer division produced 0 when computing usage
Dec 19, 2016
eaebad2
Refactor the resource usage backend
Dec 19, 2016
f6ff3f0
Refactor the resource usage backend
Dec 19, 2016
19b1ccb
Fix a bug where a backend Python function call failed
Dec 19, 2016
3b4c44b
Round decimal values on the frontend page
Dec 19, 2016
f3844a0
Split the resource usage page
Dec 26, 2016
3cb8845
Adjust the resource usage billing page
Dec 28, 2016
9a52d79
Add a month picker component
Dec 28, 2016
132baed
Adjust the precision of the backend resource usage calculation
Dec 28, 2016
1bbb9a2
Refactor the resource usage "recent" page
Dec 29, 2016
b0eb7d8
Adjustments following the Dropdown component bug fix
Dec 30, 2016
8a23fc1
Adjust the algorithm for the monthly resource usage statistics
Dec 30, 2016
b354b58
Initial version of the script that pushes data to operations
Jan 4, 2017
b9cdb1a
Start a scheduled task that pushes data to operations when Sirius starts
Jan 5, 2017
56ed4c3
Extract the thread that computes HDFS quota info so it runs on its own
Jan 6, 2017
c22afb7
Fix a bug where Aries.sh blocking kept sumspace from running
Jan 6, 2017
f83249c
Merge branch 'dev' into fix#347_add_resource_usage_page
Jan 6, 2017
128732b
Stop showing the current day's data on the resource usage "recent" page
Jan 6, 2017
20 changes: 14 additions & 6 deletions Aries/Aries/settings.py
@@ -34,7 +34,7 @@
yaml_file = open(file_name)
OPENSTACK_KEY_PATH = os.path.join(BASE_DIR,"openstack/middleware/common/key.yaml")
SETTINGS = yaml.load(yaml_file)
print SETTINGS
# print SETTINGS

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.8/howto/deployment/checklist/
@@ -152,8 +152,14 @@
'level':'DEBUG',
'class':'logging.FileHandler',
'formatter': 'complete',
'filename' :'{0}/service.log'.format(LOG_BASE_DIR).replace('\\','/')
'filename' :'{0}/kd_agent.log'.format(LOG_BASE_DIR).replace('\\','/')
},
'kd_agent_pushclusterinfo_file': {
'level':'DEBUG',
'class':'logging.FileHandler',
'formatter': 'complete',
'filename' :'{0}/kd_agent_pushclusterinfo.log'.format(LOG_BASE_DIR).replace('\\','/')
},
'openstack_log': {
'level': 'DEBUG',
'class': 'logging.FileHandler',
@@ -194,6 +200,11 @@
'kd_agent_log': {
'handlers':['kd_agent_file','console'],
'propagate': False,
'level':'INFO',
},
'kd_agent_pushclusterinfo_log': {
'handlers':['kd_agent_pushclusterinfo_file','console'],
'propagate': False,
'level':'DEBUG',
},
'openstack_log': {
@@ -310,10 +321,7 @@
PORT_CINDER = OPENSTACK_SETTINGS["PORT_CINDER"]
MONITOR_URL = OPENSTACK_SETTINGS['MONITOR_URL']

# Start a thread that periodically aggregates quota statistics. default: 10m
POLL_TIME = 600
import sumSpace
sumSpace.run(POLL_TIME)


# Whitelisted IPs for the admin page
WHITELIST_SETTINGS = SETTINGS['WHITELIST']
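The diff above adds a dedicated logger for the cluster-info push. A minimal sketch of how code elsewhere in the project would pick it up (the logger name comes from the LOGGING dict above; everything else is standard library):

import logging

logger = logging.getLogger("kd_agent_pushclusterinfo_log")
logger.debug("this record goes to kd_agent_pushclusterinfo.log and the console")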
Empty file.
Empty file.
19 changes: 19 additions & 0 deletions Aries/hdfs/management/commands/calcsumspace.py
@@ -0,0 +1,19 @@
# -*- coding: UTF-8 -*-

from django.core.management.base import BaseCommand

from Aries import sumSpace

POLL_TIME = 600

# Start a thread that periodically aggregates quota statistics. default: 10m

class Command(BaseCommand):
help = 'start a thread to calc statistical quota every 10 minutes'

# The command this script runs is fairly simple, so it accepts no arguments
def add_arguments(self, parser):
return None

def handle(self, *args, **options):
sumSpace.run( POLL_TIME )
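Since the quota thread no longer starts from settings.py, it has to be launched explicitly. A sketch of the two equivalent ways to run it, assuming a standard Django project layout:

# from a shell: python manage.py calcsumspace
# or programmatically:
from django.core.management import call_command
call_command('calcsumspace')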
Empty file.
Empty file.
119 changes: 119 additions & 0 deletions Aries/kd_agent/management/commands/pushk8sdata.py
@@ -0,0 +1,119 @@
# -*- coding: UTF-8 -*-

import traceback
import datetime
import logging
import os
import requests
import json

from django.core.management.base import BaseCommand, CommandError
from django.db import IntegrityError

from kd_agent.toolsmanager import RETU_INFO_SUCCESS,RETU_INFO_ERROR
from kd_agent.toolsmanager import generate_success,generate_failure
from kd_agent.models import ResourceUsageDailyCache as RUDC
from kd_agent.models import NamespaceDepartmentRef as NDR
from kd_agent.models import ClusterUsagePushFailureRecords as CUPFR
from kd_agent.views import get_resource_usage_info


logger = logging.getLogger("kd_agent_pushclusterinfo_log")


# Put the key info of the data to be pushed into the failure record table (ClusterUsagePushFailureRecords); the actual pushing then works off that table
def refresh_failure_record():
date = datetime.datetime.combine( datetime.datetime.now(),datetime.time() )
yesterday = date - datetime.timedelta(seconds=24*60*60)

# Insert records into the database, then push to operations based on those records
for record in NDR.objects.all():
try:
CUPFR( namespace=record,datetime=yesterday ).save()
logger.debug( 'insert undo record(%s,%s) success' % (record.namespace,yesterday) )
except IntegrityError: # duplicate primary key: if this record already exists in the database, the exception can safely be ignored
pass
except:
logger.error( 'insert undo record(%s,%s) failure : %s' % (record.namespace,yesterday,traceback.format_exc()) )

def get_push_url():
return 'http://li.app/v1/om/source/Sirius/addSirius'

def push_data():
for record in CUPFR.objects.all():
# bind these up front: the except block below logs them, and they would
# otherwise be unbound if the first statements in the try block raise
namespace = department = date = None
try:
namespace = record.namespace.namespace
department = record.namespace.department

# datetime objects that Django stores in MySQL carry no timezone,
# so for simpler handling we convert the naive datetime directly into local time (UTC+8) here
date = record.datetime+datetime.timedelta(seconds=8*60*60)
date = datetime.datetime.strptime( date.strftime('%Y-%m-%d'),'%Y-%m-%d' )

retu_data = push_identify_data(date,namespace,department)
if retu_data['code'] == RETU_INFO_SUCCESS:
record.delete()
logger.debug('push_identify_data(%s,%s,%s) success' % (date,namespace,department))
else:
logger.error('push_identify_data(%s,%s,%s) failure : %s' % (date,namespace,department,retu_data['msg']))
except:
logger.error( 'push_identify_data(%s,%s,%s) raise exception : %s' % (date,namespace,department,traceback.format_exc()) )

def push_identify_data(date,namespace,department):
retu_data = get_resource_usage_info( date,namespace )
if retu_data['code'] != RETU_INFO_SUCCESS:
s = 'get_resource_usage_info(%s,%s) failure : %s' % (date,namespace,retu_data['msg'])
return generate_failure( s )
return push_http(date,department,retu_data['data']['request'])

'''
Format of the POST data the interface accepts:
usage:[{
department:'基础研发部' string naming the department; may be the name of a second-level department, first-level department, or center
date:'2016-12-01' marks which period this record summarizes ( 2016-12-01 00:00:00.000 to 2016-12-01 23:59:59.999 )
usage:'11.03' the department's machine usage for that day, in machine-days
},{
...
}]

Note: the interface supports sending multiple records at once, but for simplicity one record is sent per request here.
'''
def push_http(date,department,usage):
post_data = {
'usage':[{
'department':department,
'date':date.strftime('%Y-%m-%d'),
'usage':str(usage)
}]
}
req = requests.post(get_push_url(), data=json.dumps(post_data))
if req.status_code != requests.codes.ok:
s = 'requests.post(%s,%s) return req.status_code is not requests.codes.ok' % \
( get_push_url(),json.dumps(post_data) )
return generate_failure( s )

retu_obj = req.json()
if retu_obj['status'] == True:
return generate_success()
else:
s = 'requests.post(%s,%s) return status is not True : %s' % ( get_push_url(),json.dumps(post_data),retu_obj )
return generate_failure( s )


class Command(BaseCommand):
help = 'Push k8s cluster usage to operation'

# The command this script runs is fairly simple, so it accepts no arguments
def add_arguments(self, parser):
return None

def handle(self, *args, **options):
command_str = str(__file__)
command_str = os.path.split(command_str)[1]
command_str = os.path.splitext(command_str)[0]
try:
refresh_failure_record()
push_data()
logger.info( 'execute command %s success' % command_str )
except:
logger.error( 'execute command %s failure : \n%s' % (command_str,traceback.format_exc()) )
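For reference, a minimal sketch of the request body push_http builds for a single record (the department and usage values are purely illustrative):

import json
import datetime

date = datetime.datetime(2016, 12, 1)
post_data = {'usage': [{'department': u'基础研发部',
                        'date': date.strftime('%Y-%m-%d'),
                        'usage': '11.03'}]}
print(json.dumps(post_data))
# {"usage": [{"department": "\u57fa\u7840\u7814\u53d1\u90e8", "date": "2016-12-01", "usage": "11.03"}]}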
116 changes: 116 additions & 0 deletions Aries/kd_agent/models.py
@@ -1 +1,117 @@
# -*- coding: UTF-8 -*-

from django.db import models
import math


from kd_agent.toolsmanager import InfluxDBQueryStrManager as ISM

def calc_minute_ave(v):
return float(v)/(24*60)
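# (The float() cast matters on Python 2: with plain integer division,
# e.g. 1000/1440 == 0, every per-minute average would collapse to zero;
# this is the "integer division produced 0" bug fixed in commit 00f359f.)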

# Statement used to create the database
# create database DecisionMakingSurvey default charset utf8 collate utf8_unicode_ci;

class ResourceUsageDailyCache(models.Model):
datetime = models.DateTimeField()
namespace = models.CharField( max_length=255 )

cpu_request = models.BigIntegerField( default=0 )
cpu_limit = models.BigIntegerField( default=0 )
cpu_usage = models.BigIntegerField( default=0 )

# The memory workingset metric is not stored here. If it is ever needed, it can simply be added.
memory_request = models.BigIntegerField( default=0 )
memory_limit = models.BigIntegerField( default=0 )
memory_usage = models.BigIntegerField( default=0 )

# Note: the cpu/memory fields above hold fairly raw data, i.e. the 1440 per-minute samples of one day all summed up,
# while the request/limit/usage fields below are derived values (see calc_virtual_machine_day)

request = models.FloatField()
limit = models.FloatField()
usage = models.FloatField()

# Convenience constructor.
# data_json is a JSON object whose keys should match those defined in ISM, namely:
# ISM.M_CPU_USAGE
# ISM.M_CPU_LIMIT
# ISM.M_CPU_REQUEST
# ISM.M_MEMORY_USAGE
# ISM.M_MEMORY_LIMIT
# ISM.M_MEMORY_REQUEST
@staticmethod
def generate_obj_by_measurement_key( datetime,namespace,data_json ):
keys_map = {
ISM.M_CPU_REQUEST:'cpu_request',
ISM.M_CPU_LIMIT:'cpu_limit',
ISM.M_CPU_USAGE:'cpu_usage',
ISM.M_MEMORY_REQUEST:'memory_request',
ISM.M_MEMORY_LIMIT:'memory_limit',
ISM.M_MEMORY_USAGE:'memory_usage',
}
obj = {}
for k,v in keys_map.items():
obj[ v ] = data_json.get(k,0)
return ResourceUsageDailyCache.generate_obj_by_base_keys( datetime,namespace,obj )

@staticmethod
def generate_obj_by_base_keys( datetime,namespace,obj ):
CVMD = ResourceUsageDailyCache.calc_virtual_machine_day
# compute the resource usage figures from the given cpu and memory data
obj['usage'] = CVMD( obj['cpu_usage'],obj['memory_usage'] )
obj['limit'] = CVMD( obj['cpu_limit'],obj['memory_limit'] )
obj['request'] = CVMD( obj['cpu_request'],obj['memory_request'] )
return ResourceUsageDailyCache( datetime=datetime,namespace=namespace,**obj )

def to_minuteaverge_measurementkey_json(self):
return {
ISM.M_CPU_REQUEST:calc_minute_ave(self.cpu_request),
ISM.M_CPU_LIMIT:calc_minute_ave(self.cpu_limit),
ISM.M_CPU_USAGE:calc_minute_ave(self.cpu_usage),
ISM.M_MEMORY_REQUEST:calc_minute_ave(self.memory_request),
ISM.M_MEMORY_LIMIT:calc_minute_ave(self.memory_limit),
ISM.M_MEMORY_USAGE:calc_minute_ave(self.memory_usage),
'request':self.request,
'limit':self.limit,
'usage':self.usage,
}

# u: how many 0.5-VCPU units (1 VCPU == cpu/1000; 0.5 VCPU is a preset unit)
# v: how many 128MB memory units (128MB is a preset unit)
# formula: u*0.025 + 0.003*v / (8*u)
# the result keeps 11 decimal places (1e-11) and always rounds up, e.g. a result of 1.11e-11 is rounded up to 2e-11
@staticmethod
def calc_virtual_machine_day( cpu_value,memory_value ):

u = calc_minute_ave(cpu_value)/1000/0.5
v = calc_minute_ave(memory_value)/128/1024/1024
try:
value = u*0.025 + 0.003*v / (8*u)
except:
value = 0

# Scale up by 1e11 and take the ceiling, then scale back down by 1e11. Because floating point
# makes the scaled-back number inexact, round it afterwards, e.g.:
# >>> 1.3/10000
# 0.00013000000000000002
d = 1e-11
return round( math.ceil(value/d)*d,11 )
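# A worked example of the formula above, assuming a namespace that held
# 1000 millicores of CPU and 512MB of memory steadily for a whole day
# (so each daily sum is 1440 times the per-minute value):
# u = (1000*1440/1440)/1000/0.5 = 2.0 (two 0.5-VCPU units)
# v = (512MB*1440/1440)/128MB = 4.0 (four 128MB units)
# value = 2.0*0.025 + 0.003*4.0/(8*2.0) = 0.05 + 0.00075 = 0.05075 machine-days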


# Maps a namespace to a department name; used only for pushing each namespace's daily cluster resource usage to operations.
# Since pushing data is unrelated to Sirius's core business, this table is kept as loosely coupled to the core business tables as possible.
class NamespaceDepartmentRef(models.Model):
namespace = models.CharField( max_length=255,primary_key=True )
department = models.CharField( max_length=1024 )

# Records push state: if pushing some day's data fails, a record for it remains in this table,
# and the next push run looks these records up and re-pushes them.
class ClusterUsagePushFailureRecords(models.Model):
namespace = models.ForeignKey( NamespaceDepartmentRef )
datetime = models.DateTimeField()
class Meta:
unique_together = ("namespace", "datetime")



37 changes: 37 additions & 0 deletions Aries/kd_agent/toolsmanager.py
@@ -127,6 +127,12 @@ class InfluxDBQueryStrManager:
WHERE "type" = '{type}' AND "pod_namespace" = '{namespace}' AND "pod_name"='{pod_name}' AND time > {time_start} and time < {time_end}
GROUP BY time(1m) fill(null)'''

# Ideally this would group by time(1440m) directly (aggregating a whole day), but for unknown reasons that returns 2 rows,
# so we step down a level: aggregate by hour, then add all the buckets together
SQL_NAMESPACE_RESOURCE_USAGE = '''SELECT sum("value") FROM "{measurement}"
WHERE "type" = '{type}' AND "pod_namespace" = '{namespace}' AND time >= {time_start} and time < {time_end}
GROUP BY time(60m) fill(null)'''


M_CPU_USAGE = 'cpu/usage_rate'
M_CPU_LIMIT = 'cpu/limit'
@@ -169,6 +175,16 @@ def format_namespace_poddetail_query_str(measurement,time_start ,time_end ,names
type=InfluxDBQueryStrManager.T_POD,
pod_name=pod_name)

@staticmethod
def format_namespace_resourceusage_query_str(measurement,time_start ,time_end ,namespace ):
return InfluxDBQueryStrManager.SQL_NAMESPACE_RESOURCE_USAGE.format(
measurement=measurement,
time_start=time_start,
time_end=time_end,
namespace=namespace,
type=InfluxDBQueryStrManager.T_POD)



@staticmethod
def get_measurement_disname_dict():
@@ -270,6 +286,27 @@ def get_namespace_poddetail_data( measurement,time_start,time_end,namespace,pod_
kd_logger.error( traceback_str )
return generate_failure( traceback_str )

@staticmethod
def get_namespace_resourceusage_data( measurement,time_start,time_end,namespace ):
kd_logger.info( 'call get_namespace_resourceusage_data with args : %s %s %s %s' % (measurement,time_start,time_end,namespace) )
try:
sql_str = InfluxDBQueryStrManager.format_namespace_resourceusage_query_str(
measurement=measurement,
time_start='%ss' % time_start ,
time_end='%ss' % time_end,
namespace=namespace)
kd_logger.info( 'generate sql_str : %s' % (sql_str) )

retu_data = InfluxDBQueryStrManager.get_influxdb_data(sql_str=sql_str)
if retu_data['code'] == RETU_INFO_SUCCESS:
kd_logger.debug( 'get influxdb data by sql_str return data : %s' % retu_data['data'] )
else:
kd_logger.error( 'get influxdb data by sql_str return error : %s' % retu_data['msg'] )
return retu_data
except:
traceback_str = traceback.format_exc()
kd_logger.error( traceback_str )
return generate_failure( traceback_str )
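A sketch of what the new query helper renders, with illustrative epoch-second bounds (M_CPU_LIMIT is defined above; the concrete value interpolated for T_POD lives elsewhere in toolsmanager.py and is shown here as a placeholder):

sql_str = InfluxDBQueryStrManager.format_namespace_resourceusage_query_str(
    measurement=InfluxDBQueryStrManager.M_CPU_LIMIT,
    time_start='1482105600s',
    time_end='1482192000s',
    namespace='default')
# -> SELECT sum("value") FROM "cpu/limit"
#    WHERE "type" = '<T_POD>' AND "pod_namespace" = 'default' AND time >= 1482105600s and time < 1482192000s
#    GROUP BY time(60m) fill(null)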



2 changes: 2 additions & 0 deletions Aries/kd_agent/urls.py
@@ -15,6 +15,8 @@
url(r'^api/namespaces/(?P<namespace>\w{1,64})/replicationcontrollers/downloadjson$',views.download_rc_json),
url(r'^apis/extensions/v1beta1/namespaces/(?P<namespace>\w{1,64})/ingresses/downloadjson$',views.download_ingress_json ),

url(r'^api/namespaces/(?P<namespace>\w{1,64})/resourceusage$',views.resource_usage),

url(r'^api/namespaces/mytaskgraph$', views.get_mytask_graph),
url(r'^download/$', views.download),
url(r'^api/namespaces/mytasklist/getoldrecords', views.mytask_get_old_records),
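With the route in place, the page (or a quick manual test) can fetch a namespace's usage. A hypothetical check against a local dev server; the URL prefix under which kd_agent/urls.py is mounted, and the view accepting GET, are both assumptions:

import requests

resp = requests.get('http://127.0.0.1:8000/api/namespaces/default/resourceusage')
print(resp.json())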