ffrt对接hisysevent

Signed-off-by: linyu <yulin24@huawei.com>
This commit is contained in:
linyu 2024-08-21 09:33:52 +08:00
parent e56fd28f25
commit f9cd177bfe
13 changed files with 186 additions and 58 deletions

View File

@ -58,6 +58,7 @@ config("ffrt_config") {
"FFRT_TRACE_LEVEL=1",
"QOS_FRAME_RTG",
"FFRT_TRACE_RECORD_LEVEL=1",
"FFRT_SEND_EVENT",
]
#ffrt_release_defines = [
@ -120,6 +121,7 @@ ohos_shared_library("libffrt") {
"src/dfx/bbox/fault_logger_fd_manager.cpp",
"src/dfx/dump/dump.cpp",
"src/dfx/log/ffrt_log.cpp",
"src/dfx/sysevent/sysevent.cpp",
"src/dfx/trace/ffrt_trace.cpp",
"src/dfx/trace_record/ffrt_trace_record.cpp",
"src/dfx/watchdog/watchdog_util.cpp",

View File

@ -1,34 +1,48 @@
# Copyright (c) 2023 Huawei Device Co., Ltd.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# Copyright (c) 2024 Huawei Device Co., Ltd.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
domain: FFRT
SERIAL_TASK_TIMEOUT:
__BASE: {type: FAULT, level: CRITICAL, tag: STABILITY, desc: ffrt task timeout}
PID: {type: INT32, desc: failure process pid}
TGID: {type: INT32, desc: failure process gid}
UID: {type: INT32, desc: failure process uid}
MODULE_NAME: {type: STRING, desc: failure module name}
PROCESS_NAME: {type: STRING, desc: process name}
MSG: {type: STRING, desc: event message}
STACK: {type: STRING, desc: stacktrace of ffrt tasks}
AI_KERN_POWER_UP_ERR:
__BASE: {type: FAULT, level: MINOR, desc: AI KERNEL ipu fail or status invalid, preserve: true}
ERROR_LEVEL: {type: INT32, desc: error level}
IC_NAME: {type: STRING, desc: IC name}
MODULE_NAME: {type: STRING, desc: module name}
DESCRIPTION: {type: STRING, desc: description}
COUNT: {type: INT32, desc: report counts}
CONTENT: {type: STRING, desc: fault message}
DEVICE_NAME: {type: STRING, desc: device name}
RUNNING_TEST_SWITCH: {type: BOOL, desc: running_test_switch}
FAULT_PHENOMENON: {type: STRING, desc: fault_phenomenon}
NFF_THRESHOLD_MOUTH: {type: INT32, desc: nff_threshold_mouth}
NFF_THRESHOLD_DAY: {type: INT32, desc: nff_threshold_day}
TASK_DEADLOCK:
__BASE: {type: FAULT, level: CRITICAL, tag: STABILITY, desc: ffrt task dead lock}
PID: {type: INT32, desc: failure process pid}
TGID: {type: INT32, desc: failure process gid}
UID: {type: INT32, desc: failure process uid}
MODULE_NAME: {type: STRING, desc: failure module name}
AI_KERN_WTD_TIMEOUT_ERR:
__BASE: {type: FAULT, level: MINOR, desc: AI KERNEL ipu soft wtd timeout, preserve: true}
ERROR_LEVEL: {type: INT32, desc: error level}
IC_NAME: {type: STRING, desc: IC name}
MODULE_NAME: {type: STRING, desc: module name}
DESCRIPTION: {type: STRING, desc: description}
COUNT: {type: INT32, desc: report counts}
CONTENT: {type: STRING, desc: fault message}
DEVICE_NAME: {type: STRING, desc: device name}
RUNNING_TEST_SWITCH: {type: BOOL, desc: running_test_switch}
FAULT_PHENOMENON: {type: STRING, desc: fault_phenomenon}
NFF_THRESHOLD_MOUTH: {type: INT32, desc: nff_threshold_mouth}
NFF_THRESHOLD_DAY: {type: INT32, desc: nff_threshold_day}
TASK_TIMEOUT:
__BASE: {type: FAULT, level: CRITICAL, desc: task timeout, preserve: true}
SENARIO: {type: STRING, desc: senario}
PROCESS_NAME: {type: STRING, desc: process name}
MSG: {type: STRING, desc: event message}
STACK: {type: STRING, desc: stacktrace of ffrt tasks}
MSG: {type: STRING, desc: message}

View File

@ -31,7 +31,8 @@ typedef enum {
ffrt_normal_task = 0,
ffrt_io_task = 1,
ffrt_uv_task, // only used to register func for libuv
ffrt_queue_task
ffrt_queue_task,
ffrt_invalid_task
} ffrt_executor_task_type_t;
typedef void (*ffrt_executor_task_func)(ffrt_executor_task_t* data, ffrt_qos_t qos);

View File

@ -19,6 +19,7 @@ file(GLOB_RECURSE FFRT_SRC_LIST
"${FFRT_CODE_PATH}/dfx/log/ffrt_log.cpp"
"${FFRT_CODE_PATH}/dfx/log/${FFRT_LOG_PLAT}/log_base.cpp"
"${FFRT_CODE_PATH}/dfx/dump/dump.cpp"
"${FFRT_CODE_PATH}/dfx/sysevent/sysevent.cpp"
"${FFRT_CODE_PATH}/dfx/watchdog/watchdog_util.cpp"
)

View File

@ -0,0 +1,17 @@
#ifdef FFRT_SEND_EVENT
#include "sysevent.h"
#include "hisysevent.h"
namespace ffrt {
void TaskTimeoutReport(std::stringstream& ss, std::string& processNameStr, std::string& senarioName)
{
std::string msg = ss.str();
std::string eventName = "TASK_TIMEOUT";
time_t cur_time = time(nullptr);
std::string sendMsg = std::string((ctime(&cur_time) == nullptr) ? "" : ctime(&cur_time)) + "\n" + msg + "\n";
HiSysEventWrite(OHOS::HiviewDFX::HiSysEvent::Domain::FFRT, eventName,
OHOS::HiviewDFX::HiSysEvent::EventType::FAULT, "SENARIO", senarioName,
"PROCESS_NAME", processNameStr, "MSG", sendMsg);
}
}
#endif

View File

@ -0,0 +1,25 @@
/*
* Copyright (c) 2023 Huawei Device Co., Ltd.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef __FFRT_SYSEVENT_H__
#define __FFRT_SYSEVENT_H__
#include <sstream>
#include <string>
namespace ffrt {
#ifdef FFRT_SEND_EVENT
void TaskTimeoutReport(std::stringstream& ss, std::string& processNameStr, std::string& senarioName);
#endif
}
#endif

View File

@ -106,13 +106,18 @@ void CPUWorker::RunTask(ffrt_executor_task_t* curtask, CPUWorker* worker)
auto ctx = ExecuteCtx::Cur();
CPUEUTask* task = reinterpret_cast<CPUEUTask*>(curtask);
worker->curTask = task;
worker->curTaskType_ = task->type;
switch (curtask->type) {
case ffrt_normal_task:
case ffrt_queue_task: {
worker->curTaskLabel_ = task->label;
worker->curTaskGid_ = task->gid;
ctx->task = task;
ctx->lastGid_ = task->gid;
Run(task);
ctx->task = nullptr;
worker->curTaskLabel_ = "";
worker->curTaskGid_ = UINT64_MAX;
break;
}
default: {
@ -123,6 +128,7 @@ void CPUWorker::RunTask(ffrt_executor_task_t* curtask, CPUWorker* worker)
}
}
worker->curTask = nullptr;
worker->curTaskType_ = ffrt_invalid_task;
}
void CPUWorker::RunTaskLifo(ffrt_executor_task_t* task, CPUWorker* worker)

View File

@ -39,6 +39,10 @@ const std::vector<uint64_t> FFRT_RETRY_CYCLE_LIST = {
class WorkerThread {
public:
CPUEUTask* curTask = nullptr;
uintptr_t curTaskType_ = ffrt_invalid_task;
std::string curTaskLabel_ = "";
uint64_t curTaskGid_ = UINT64_MAX; //仅当TaskType为normal task或者queue task时label和gid才能正常读取
explicit WorkerThread(const QoS& qos);
virtual ~WorkerThread()

View File

@ -25,6 +25,7 @@
#include "sched/scheduler.h"
namespace {
constexpr int PROCESS_NAME_BUFFER_LENGTH = 1024;
constexpr uint32_t STRING_SIZE_MAX = 128;
constexpr uint32_t TASK_DONE_WAIT_UNIT = 10;
}
@ -298,20 +299,16 @@ void QueueHandler::SetTimeoutMonitor(QueueTask* task)
void QueueHandler::RunTimeOutCallback(QueueTask* task)
{
std::stringstream ss;
ss << "serial queue [" << name_ << "] queueId=" << GetQueueId() << ", serial task gid=" << task->gid <<
" execution time exceeds " << timeout_ << " us";
std::string msg = ss.str();
std::string eventName = "SERIAL_TASK_TIMEOUT";
#ifdef FFRT_SEND_EVENT
time_t cur_time = time(nullptr);
std::string sendMsg = std::string((ctime(&cur_time) == nullptr) ? "" : ctime(&cur_time)) + "\n" + msg + "\n";
HiSysEventWrite(OHOS::HiviewDFX::HiSysEvent::Domain::FFRT, eventName,
OHOS::HiviewDFX::HiSysEvent::EventType::FAULT, "PID", getpid(), "TGID", getgid(), "UID", getuid(),
"MODULE_NAME", "ffrt", "PROCESS_NAME", "ffrt", "MSG", sendMsg);
#endif
FFRT_LOGE("[%s], %s", eventName.c_str(), msg.c_str());
static std::once_flag flag;
static char processName[PROCESS_NAME_BUFFER_LENGTH];
std::call_once(flag, []() {
GetProcessName(processName, PROCESS_NAME_BUFFER_LENGTH);
});
std::string processNameStr = std::string(processName);
ss << "[Serial_Queue_Timeout_Callback] process name:[" << processNameStr << "], serial queue:[" <<
name_ << "], queueId:[" << GetQueueId() << "], serial task gid:[" << task->gid << "], task name:["
<< task->label << "], execution time exceeds[" << timeout_ << "] us";
FFRT_LOGE("%s", ss.str().c_str());
if (timeoutCb_ != nullptr) {
timeoutCb_->exec(timeoutCb_);
}

View File

@ -18,8 +18,11 @@
#include "util/slab.h"
#include "sync/sync.h"
#include "c/ffrt_dump.h"
#include "dfx/sysevent/sysevent.h"
#include "internal_inc/osal.h"
namespace {
constexpr int PROCESS_NAME_BUFFER_LENGTH = 1024;
constexpr uint32_t INVALID_TASK_ID = 0;
constexpr uint32_t TIME_CONVERT_UNIT = 1000;
constexpr uint64_t QUEUE_INFO_INITIAL_CAPACITY = 64;
@ -39,6 +42,7 @@ QueueMonitor::QueueMonitor()
FFRT_LOGI("queue monitor ctor enter");
queuesRunningInfo_.reserve(QUEUE_INFO_INITIAL_CAPACITY);
queuesStructInfo_.reserve(QUEUE_INFO_INITIAL_CAPACITY);
lastReportedTask_.reserve(QUEUE_INFO_INITIAL_CAPACITY);
we_ = new (SimpleAllocator<WaitUntilEntry>::AllocMem()) WaitUntilEntry();
uint64_t timeout = ffrt_task_timeout_get_threshold() * TIME_CONVERT_UNIT;
if (timeout < MIN_TIMEOUT_THRESHOLD_US) {
@ -79,6 +83,7 @@ void QueueMonitor::RegisterQueueId(uint32_t queueId, QueueHandler* queueStruct)
if (queueId == queuesRunningInfo_.size()) {
queuesRunningInfo_.emplace_back(std::make_pair(INVALID_TASK_ID, std::chrono::steady_clock::now()));
queuesStructInfo_.emplace_back(queueStruct);
lastReportedTask_.emplace_back(INVALID_TASK_ID);
FFRT_LOGD("queue registration in monitor gid=%u in turn succ", queueId);
return;
}
@ -88,6 +93,7 @@ void QueueMonitor::RegisterQueueId(uint32_t queueId, QueueHandler* queueStruct)
for (uint32_t i = queuesRunningInfo_.size(); i <= queueId; ++i) {
queuesRunningInfo_.emplace_back(std::make_pair(INVALID_TASK_ID, std::chrono::steady_clock::now()));
queuesStructInfo_.emplace_back(nullptr);
lastReportedTask_.emplace_back(INVALID_TASK_ID);
}
queuesStructInfo_[queueId] = queueStruct;
}
@ -103,6 +109,7 @@ void QueueMonitor::ResetQueueInfo(uint32_t queueId)
FFRT_COND_DO_ERR((queuesRunningInfo_.size() <= queueId), return,
"ResetQueueInfo queueId=%u access violation, RunningInfo_.size=%u", queueId, queuesRunningInfo_.size());
queuesRunningInfo_[queueId].first = INVALID_TASK_ID;
lastReportedTask_[queueId] = INVALID_TASK_ID;
}
void QueueMonitor::ResetQueueStruct(uint32_t queueId)
@ -193,13 +200,21 @@ void QueueMonitor::CheckQueuesStatus()
if (taskTimestamp < startThreshold) {
std::stringstream ss;
ss << "SERIAL_TASK_TIMEOUT: serial queue qid=" << i << ", serial task gid=" << taskId << " execution " <<
timeoutUs_ << " us.";
char processName[PROCESS_NAME_BUFFER_LENGTH];
GetProcessName(processName, PROCESS_NAME_BUFFER_LENGTH);
ss << "Serial_Queue_Timeout, process name:[" << processNameStr << "], serial queue qid:[" << i
<< "], serial task gid:[" << taskId << "], execution:[" << timeoutUs_ << "] us.";
if (queuesStructInfo_[i] != nullptr) {
ss << queuesStructInfo_[i]->GetDfxInfo();
}
FFRT_LOGE("%s", ss.str().c_str());
#ifdef FFRT_SEND_EVENT
if (lastReportedTask_[i] != taskId) {
lastReportedTask_[i] = taskId;
std::string processNameStr = std::string(processName);
TaskTimeoutReport(ss, processNameStr, "Serial_Queue_Timeout");
}
#endif
ffrt_task_timeout_cb func = ffrt_task_timeout_get_cb();
if (func) {
func(taskId, ss.str().c_str(), ss.str().size());

View File

@ -50,6 +50,7 @@ private:
std::vector<QueueHandler*> queuesStructInfo_;
std::atomic_bool exit_ { false };
std::atomic_bool abortSendTimer_ { false };
std::vector<uint64_t> lastReportedTask_;
};
} // namespace ffrt

View File

@ -17,11 +17,13 @@
#include <cstring>
#include <iostream>
#include <fstream>
#include <sstream>
#include <regex>
#ifdef FFRT_OH_TRACE_ENABLE
#include "backtrace_local.h"
#endif
#include "dfx/sysevent/sysevent.h"
#include "eu/execute_unit.h"
#include "eu/worker_manager.h"
#include "eu/co_routine_factory.h"
@ -29,6 +31,7 @@
#include "sched/scheduler.h"
namespace {
constexpr int HISYSEVENT_TIMEOUT_SEC = 60;
constexpr int PROCESS_NAME_BUFFER_LENGTH = 1024;
constexpr int MONITOR_SAMPLING_CYCLE_US = 500 * 1000;
constexpr int SAMPLING_TIMES_PER_SEC = 1000 * 1000 / MONITOR_SAMPLING_CYCLE_US;
@ -166,9 +169,10 @@ void WorkerMonitor::CheckWorkerStatus()
return;
}
}
std::vector<std::pair<int, int>> timeoutFunctions;
std::vector<TimeoutFunctionInfo> timeoutFunctions;
for (int i = 0; i < QoS::MaxNum(); i++) {
std::shared_lock<std::shared_mutex> lck(workerGroup[i].tgMutex);
size_t coWorkerCount = workerGroup[i].threads.size();
for (auto& thread : workerGroup[i].threads) {
WorkerThread* worker = thread.first;
CPUEUTask* workerTask = worker->curTask;
@ -177,19 +181,19 @@ void WorkerMonitor::CheckWorkerStatus()
continue;
}
RecordTimeoutFunctionInfo(worker, workerTask, timeoutFunctions);
RecordTimeoutFunctionInfo(coWorkerCount, worker, workerTask, timeoutFunctions);
}
}
for (const auto& timeoutFunction : timeoutFunctions) {
RecordSymbolAndBacktrace(timeoutFunction.first, timeoutFunction.second);
RecordSymbolAndBacktrace(timeoutFunction);
}
SubmitSamplingTask();
}
void WorkerMonitor::RecordTimeoutFunctionInfo(WorkerThread* worker, CPUEUTask* workerTask,
std::vector<std::pair<int, int>>& timeoutFunctions)
void WorkerMonitor::RecordTimeoutFunctionInfo(size_t coWorkerCount, WorkerThread* worker, CPUEUTask* workerTask,
std::vector<TimeoutFunctionInfo>& timeoutFunctions)
{
auto workerIter = workerStatus_.find(worker);
if (workerIter == workerStatus_.end()) {
@ -201,7 +205,8 @@ void WorkerMonitor::RecordTimeoutFunctionInfo(WorkerThread* worker, CPUEUTask* w
if (taskInfo.task_ == workerTask) {
int taskExecutionTime = ++taskInfo.sampledTimes_ * MONITOR_SAMPLING_CYCLE_US;
if (taskExecutionTime % TIMEOUT_RECORD_CYCLE_LIST[taskInfo.recordLevel_] == 0) {
timeoutFunctions.emplace_back(std::make_pair(worker->Id(), taskInfo.sampledTimes_));
timeoutFunctions.emplace_back(static_cast<int>(worker->GetQos()), coWorkerCount, worker->Id(),
taskInfo.sampledTimes_, worker->curTaskType_, worker->curTaskGid_, worker->curTaskLabel_);
if (taskInfo.recordLevel_ < static_cast<int>(TIMEOUT_RECORD_CYCLE_LIST.size()) - 1) {
taskInfo.recordLevel_++;
}
@ -216,20 +221,37 @@ void WorkerMonitor::RecordTimeoutFunctionInfo(WorkerThread* worker, CPUEUTask* w
workerIter->second = TaskTimeoutInfo(workerTask);
}
void WorkerMonitor::RecordSymbolAndBacktrace(int tid, int sampleTimes)
void WorkerMonitor::RecordSymbolAndBacktrace(const TimeoutFunctionInfo& timeoutFunction)
{
int sampleSeconds = (sampleTimes == 0) ? 1 : sampleTimes / SAMPLING_TIMES_PER_SEC;
FFRT_LOGW("Tid[%d] function occupies worker for more than [%d]s.", tid, sampleSeconds);
int sampleSeconds = (timeoutFunction.sampledTimes_ == 0) ? 1 : timeoutFunction.sampledTimes_ /
SAMPLING_TIMES_PER_SEC;
std::stringstream ss;
char processName[PROCESS_NAME_BUFFER_LENGTH];
GetProcessName(processName, PROCESS_NAME_BUFFER_LENGTH);
ss << "Task_Sch_Timeout: process name:[" << processName << "], Tid:[" << timeoutFunction.tid_ <<
"], Worker QoS Level:[" << timeoutFunction.qosLevel_ << "], Concurrent Worker Count:[" <<
timeoutFunction.coWorkerCount_ << "], Task Type:[" << timeoutFunction.type_ << "], ";
if (timeoutFunction.type_ == ffrt_normal_task || timeoutFunction.type_ == ffrt_queue_task) {
ss << "Task Name:[" << timeoutFunction.label_ << "], Task Id:[" << timeoutFunction.gid_ << "], ";
}
ss << "occupies worker for more than [" << sampleSeconds << "]s";
FFRT_LOGW("%s", ss.str().c_str());
#ifdef FFRT_OH_TRACE_ENABLE
std::string dumpInfo;
if (OHOS::HiviewDFX::GetBacktraceStringByTid(dumpInfo, tid, 0, false)) {
if (OHOS::HiviewDFX::GetBacktraceStringByTid(dumpInfo, timeoutFunction.tid_, 0, false)) {
FFRT_LOGW("Backtrace:\n%s", dumpInfo.c_str());
if (sampleSeconds >= RECORD_IPC_INFO_TIME_THRESHOLD) {
RecordIpcInfo(dumpInfo);
}
}
#endif
#ifdef FFRT_SEND_EVENT
if (sampleSeconds == HISYSEVENT_TIMEOUT_SEC) {
std::string processNameStr = std::string(processName);
TaskTimeoutReport(ss, processNameStr, "Task_Sch_Timeout");
}
#endif
}
void WorkerMonitor::RecordIpcInfo(const std::string& dumpInfo)

View File

@ -30,6 +30,29 @@ struct TaskTimeoutInfo {
explicit TaskTimeoutInfo(CPUEUTask* task) : task_(task) {}
};
struct TimeoutFunctionInfo {
size_t qosLevel_;
int coWorkerCount_;
int tid_;
int sampledTimes_;
uintptr_t type_;
uint64_t gid_;
std::string label_;
TimeoutFunctionInfo(size_t qosLevel, int coWorkerCount, int workerId, int sampledTimes,
uintptr_t workerTaskType, uint64_t taskId, std::string workerTaskLabel)
: qosLevel_(qosLevel), coWorkerCount_(coWorkerCount), tid_(workerId), sampledTimes_(sampledTimes),
type_(workerTaskType) {
if (type_ == ffrt_normal_task || type_ == ffrt_queue_task) {
gid_ = taskId;
label_ = workerTaskLabel;
} else {
gid_ = UINT64_MAX; //该task type 没有 gid
label_ = "Unsupport_Task_type"; //该task type 没有 label
}
}
};
class WorkerMonitor {
public:
static WorkerMonitor &GetInstance();
@ -45,9 +68,9 @@ private:
void SubmitSamplingTask();
void SubmitMemReleaseTask();
void CheckWorkerStatus();
void RecordTimeoutFunctionInfo(WorkerThread* worker, CPUEUTask* workerTask,
std::vector<std::pair<int, int>>& timeoutFunctions);
void RecordSymbolAndBacktrace(int tid, int sampleTimes);
void RecordTimeoutFunctionInfo(size_t coWorkerCount, WorkerThread* worker,
CPUEUTask* workerTask, std::vector<TimeoutFunctionInfo>& timeoutFunctions);
void RecordSymbolAndBacktrace(const TimeoutFunctionInfo& timeoutFunction);
void RecordIpcInfo(const std::string& dumpInfo);
private: