Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor(log): refactor agent log impl #719

Merged
merged 7 commits into from
Jul 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,23 @@

package ai.starwhale.mlops.agent.configuration;

import ai.starwhale.mlops.agent.container.ContainerClient;
import ai.starwhale.mlops.agent.node.SourcePool;
import ai.starwhale.mlops.agent.task.Action;
import ai.starwhale.mlops.agent.task.inferencetask.AgentTaskScheduler;
import ai.starwhale.mlops.agent.task.inferencetask.InferenceTask;
import ai.starwhale.mlops.agent.task.inferencetask.LogRecorder;
import ai.starwhale.mlops.agent.task.inferencetask.TaskPool;
import ai.starwhale.mlops.agent.task.inferencetask.AgentTaskScheduler;
import ai.starwhale.mlops.agent.task.inferencetask.executor.TaskExecutor;
import ai.starwhale.mlops.agent.task.inferencetask.initializer.TaskPoolInitializer;
import ai.starwhale.mlops.agent.task.inferencetask.persistence.FileSystemPath;
import ai.starwhale.mlops.agent.task.inferencetask.persistence.TaskPersistence;
import ai.starwhale.mlops.agent.task.log.FileLog;
import ai.starwhale.mlops.agent.task.log.LogConfigurator;
import ai.starwhale.mlops.agent.task.log.LogRecorder;
import ai.starwhale.mlops.agent.task.log.MemoryLog;
import ai.starwhale.mlops.api.protocol.report.req.ReportRequest;
import ai.starwhale.mlops.api.protocol.report.resp.ReportResponse;
import ch.qos.logback.classic.LoggerContext;
import ch.qos.logback.classic.PatternLayout;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
Expand All @@ -52,8 +56,30 @@ public TaskPool taskPool() {
}

@Bean
public LogRecorder logRecorder(ContainerClient containerClient, TaskPersistence taskPersistence) {
return new LogRecorder(containerClient, taskPersistence);
/**
 * Builds the agent's {@link LogRecorder} with two sinks:
 * an in-memory sink (rendered with the console pattern) that also serves as the
 * realtime reader for the UI, and a file sink (rendered with the file pattern)
 * persisted through {@link TaskPersistence}.
 *
 * @param taskPersistence storage backend used by the file-based log sink
 * @return the fully configured LogRecorder
 */
public LogRecorder logRecorder(TaskPersistence taskPersistence) {
// obtain and apply the default configuration to the shared logback context before wiring layouts
LoggerContext loggerContext = LogConfigurator.getLoggerContext();
LogConfigurator.defaultConfigure(loggerContext);

LogRecorder logRecorder = new LogRecorder(loggerContext);
// for ui: memory sink using the console pattern; registered twice — as a sink and as the realtime reader
PatternLayout consoleLayout = new PatternLayout();
consoleLayout.setContext(loggerContext);
consoleLayout.setPattern(LogConfigurator.resolve(loggerContext, "${CONSOLE_LOG_PATTERN}"));
consoleLayout.setOutputPatternAsHeader(false);
consoleLayout.start();
MemoryLog memoryLog = new MemoryLog(consoleLayout);
logRecorder.register(memoryLog);
logRecorder.registerRealtimeReader(memoryLog);

// for file: persisted sink using the file pattern
PatternLayout fileLayout = new PatternLayout();
fileLayout.setContext(loggerContext);
fileLayout.setPattern(LogConfigurator.resolve(loggerContext, "${FILE_LOG_PATTERN}"));
fileLayout.setOutputPatternAsHeader(false);
fileLayout.start();
logRecorder.register(new FileLog(taskPersistence, fileLayout));

return logRecorder;
}

@Bean
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package ai.starwhale.mlops.agent.task.inferencetask;

import ai.starwhale.mlops.agent.task.inferencetask.executor.TaskExecutor;
import ai.starwhale.mlops.agent.task.log.LogRecorder;
import org.springframework.scheduling.annotation.Scheduled;

public class AgentTaskScheduler {
Expand Down Expand Up @@ -53,13 +54,13 @@ public void uploadResultingTasks() {
this.executor.uploadTaskResults();
}

@Scheduled(fixedDelayString = "${sw.agent.task.schedule.fixedDelay.in.milliseconds:5000}")
// report task status to the controller; every 2s unless overridden by the property below
@Scheduled(fixedDelayString = "${sw.agent.task.schedule.report.fixedDelay.in.milliseconds:2000}")
public void reportTasks() {
this.executor.reportTasks();
}

@Scheduled(fixedDelayString = "${sw.agent.task.container.log.schedule.fixedDelay.in.milliseconds:5000}")
/**
 * Periodically cleans the realtime log reader (every 10s unless overridden by the
 * property below) so consumed in-memory log entries are released.
 */
@Scheduled(fixedDelayString = "${sw.agent.task.schedule.logClean.fixedDelay.in.milliseconds:10000}")
public void logScheduler() {
this.logRecorder.getRealtimeReader().clean();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,19 @@
* task's execute stage
*/
public enum InferenceStage {
/** transition: init -> preparing */
INIT2PREPARING("init to preparing"),
/** transitions out of the preparing state (typo "preparing yo canceled" fixed to "preparing to canceled") */
PREPARING2RUNNING("preparing to running"), PREPARING2CANCELED("preparing to canceled"),
/** runtime monitoring, and the running -> canceled transition */
RUNNING("runtime monitoring"), RUNNING2CANCELED("running to canceled"),
/** transitions out of the uploading state */
UPLOADING2FINISHED("uploading result file"), UPLOADING2CANCELED("uploading to canceled"),
/** terminal archived stage */
ARCHIVED("archived");

// human-readable description of the stage, used when composing log messages
private final String desc;

InferenceStage(String desc) {
this.desc = desc;
}

/**
 * @return the human-readable description of this stage
 */
public String desc() {
return desc;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@

package ai.starwhale.mlops.agent.task.inferencetask;

import java.util.*;
import java.util.LinkedHashSet;
import java.util.Set;

public class TaskPool {

Expand Down Expand Up @@ -55,13 +56,13 @@ public void fill(InferenceTask task) {
break;
case CANCELING:
switch (task.getStage()) {
case PREPARING:
case PREPARING2CANCELED:
add2PreparingQueue(task);
break;
case RUNNING:
case RUNNING2CANCELED:
runningTasks.add(task);
break;
case UPLOADING:
case UPLOADING2CANCELED:
uploadingTasks.add(task);
break;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,14 @@
* limitations under the License.
*/

package ai.starwhale.mlops.agent.task.inferencetask.action.normal.cancel;
package ai.starwhale.mlops.agent.task.inferencetask.action;

import ai.starwhale.mlops.agent.task.inferencetask.InferenceStage;

import java.util.Optional;

public interface ExecuteStage {
/**
* represent current stage
* @return current stage
*/
default Optional<InferenceStage> stage() {
return Optional.empty();
}
InferenceStage stage();
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,22 +18,18 @@

import ai.starwhale.mlops.agent.container.ContainerClient;
import ai.starwhale.mlops.agent.task.Context;
import ai.starwhale.mlops.agent.task.inferencetask.InferenceStage;
import ai.starwhale.mlops.agent.task.inferencetask.InferenceTask;
import ai.starwhale.mlops.agent.task.inferencetask.InferenceTaskStatus;
import ai.starwhale.mlops.agent.task.inferencetask.LogRecorder;
import ai.starwhale.mlops.agent.task.inferencetask.action.normal.AbsBaseTaskAction;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

@Slf4j
@Service
public class ArchivedAction extends AbsBaseTaskAction {

@Autowired
private LogRecorder logRecorder;

@Override
public InferenceTask processing(InferenceTask originTask, Context context)
throws Exception {
Expand All @@ -57,12 +53,17 @@ public void success(InferenceTask originTask, InferenceTask newTask, Context con
}
// upload agent log to the storage
taskPersistence.uploadLog(originTask);
info(originTask, "task was archived.");
// remove from origin list
taskPool.failedTasks.remove(originTask);
taskPool.succeedTasks.remove(originTask);
taskPool.canceledTasks.remove(originTask);

logRecorder.remove(originTask.getId());

}

/**
 * @return the stage this action executes: {@link InferenceStage#ARCHIVED}
 */
@Override
public InferenceStage stage() {
return InferenceStage.ARCHIVED;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,16 @@
import ai.starwhale.mlops.agent.task.inferencetask.InferenceTask;
import ai.starwhale.mlops.agent.task.inferencetask.InferenceTask.ActionStatus;
import ai.starwhale.mlops.agent.task.inferencetask.TaskPool;
import ai.starwhale.mlops.agent.task.inferencetask.action.ExecuteStage;
import ai.starwhale.mlops.agent.task.inferencetask.persistence.FileSystemPath;
import ai.starwhale.mlops.agent.task.inferencetask.persistence.TaskPersistence;
import ai.starwhale.mlops.agent.task.log.LogRecorder;
import cn.hutool.json.JSONUtil;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;

import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.Objects;

@Slf4j
public abstract class AbsBaseTaskAction implements Action<InferenceTask, InferenceTask> {
public abstract class AbsBaseTaskAction implements Action<InferenceTask, InferenceTask>, ExecuteStage {

@Autowired
protected TaskPersistence taskPersistence;
Expand All @@ -55,30 +53,39 @@ public abstract class AbsBaseTaskAction implements Action<InferenceTask, Inferen
@Autowired
protected AgentProperties agentProperties;

@Autowired
protected LogRecorder logRecorder;

private final String logPattern = "task:{}, stage:{}, msg:{}, taskDetail:{}";

/**
 * Entry hook shared by all task actions: logs entry into the stage, stamps the
 * task with the current stage and an in-progress action status, then persists it.
 */
@Override
public void pre(InferenceTask task, Context context) {
info(task, String.format("enter %s stage.", stage().desc()));
task.setStage(stage());
task.setActionStatus(ActionStatus.inProgress);
taskPersistence.save(task);
}

// exit hook: in the normal actions the newTask argument is not used here;
// logs the stage exit, marks the action completed, and persists the task
@Override
public void post(InferenceTask originTask, InferenceTask newTask, Context context) {
info(originTask, String.format("exit %s stage.", stage().desc()));
originTask.setActionStatus(ActionStatus.completed);
taskPersistence.save(originTask);
}

protected void recordLog(InferenceTask task, String simpleMsg, Exception e) {
taskPersistence.recordLog(task, simpleMsg + ":" + getStackTrace(e));
/**
 * Failure hook: writes the error both to the class logger and, with full stage
 * context, to the task-scoped log recorder.
 */
@Override
public void fail(InferenceTask originTask, Context context, Exception e) {
log.error("execute task:{}, error:{}", originTask.getId(), e.getMessage());
error(originTask, e.getMessage(), e);
}

private String getStackTrace(Throwable throwable) {
if (Objects.isNull(throwable)) return "";
StringWriter sw = new StringWriter();
// record an info-level, task-scoped entry using the shared logPattern
// (task id, stage description, message, full task as JSON)
protected void info(InferenceTask task, String simpleMsg) {
logRecorder.info(this.getClass().getName(), logPattern, new Object[]{task.getId(), stage().desc(), simpleMsg, JSONUtil.toJsonStr(task)}, task);
}

try (PrintWriter pw = new PrintWriter(sw)) {
throwable.printStackTrace(pw);
return sw.toString();
}
// record an error-level, task-scoped entry using the shared logPattern;
// e may be null when there is no exception (see the container-monitoring callers)
protected void error(InferenceTask task, String simpleMsg, Exception e) {
logRecorder.error(this.getClass().getName(), logPattern, new Object[]{task.getId(), stage().desc(), simpleMsg, JSONUtil.toJsonStr(task)}, e, task);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
import ai.starwhale.mlops.agent.exception.ErrorCode;
import ai.starwhale.mlops.agent.node.SourcePool;
import ai.starwhale.mlops.agent.task.Context;
import ai.starwhale.mlops.agent.task.inferencetask.InferenceStage;
import ai.starwhale.mlops.agent.task.inferencetask.InferenceTask;
import ai.starwhale.mlops.domain.node.Device;
import cn.hutool.core.bean.BeanUtil;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;

Expand Down Expand Up @@ -57,4 +57,9 @@ public void success(InferenceTask originTask, InferenceTask newTask, Context con
// add the new task to the tail
taskPool.add2PreparingQueue(newTask);
}

/**
 * @return the stage this action executes: {@link InferenceStage#INIT2PREPARING}
 */
@Override
public InferenceStage stage() {
return InferenceStage.INIT2PREPARING;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,11 @@

import ai.starwhale.mlops.agent.container.ContainerClient;
import ai.starwhale.mlops.agent.task.Context;
import ai.starwhale.mlops.agent.task.inferencetask.InferenceStage;
import ai.starwhale.mlops.agent.task.inferencetask.InferenceTask;
import ai.starwhale.mlops.agent.task.inferencetask.InferenceTaskStatus;
import ai.starwhale.mlops.agent.task.inferencetask.LogRecorder;
import ai.starwhale.mlops.agent.task.inferencetask.persistence.TaskPersistence.ExecuteStatus;
import cn.hutool.core.bean.BeanUtil;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import java.util.Optional;
Expand All @@ -33,8 +31,6 @@
@Service
public class MonitoringAction extends AbsBaseTaskAction {

@Autowired
private LogRecorder logRecorder;

@Override
public InferenceTask processing(InferenceTask runningTask, Context context)
Expand Down Expand Up @@ -76,6 +72,8 @@ public void success(InferenceTask originTask, InferenceTask newTask, Context con
sourcePool.release(newTask.getDevices());
// only update memory list,there is no need to update the disk file(already update by taskContainer)
taskPool.runningTasks.remove(originTask);
// log error
error(originTask, "task execution failed", null);
} else {
// try to detect container status
ContainerClient.ContainerStatus status = containerClient.status(newTask.getContainerId());
Expand All @@ -89,8 +87,7 @@ public void success(InferenceTask originTask, InferenceTask newTask, Context con
log.error("task:{} maximum number of restart retries:{} has been reached, task failed",
originTask.getId(), agentProperties.getTask().getRetryRestartMaxNum());

recordLog(originTask,
String.format("stage:running, task:%s container is dead, maximum number of restart retries num has been reached, task failed", originTask.getId()), null);
error(originTask, String.format("container:%s is dead, maximum number of restart retries num has been reached, task failed!", originTask.getContainerId()), null);

sourcePool.release(newTask.getDevices());
newTask.setStatus(InferenceTaskStatus.FAIL);
Expand All @@ -99,12 +96,11 @@ public void success(InferenceTask originTask, InferenceTask newTask, Context con
} else {
log.warn("container:{} is dead, now will restart it", originTask.getContainerId());

recordLog(originTask,
String.format("stage:running, task:%s container:%s is dead, now will restart it", originTask.getId(), originTask.getContainerId()), null);
error(originTask, String.format("container:%s is dead, now will restart it.", originTask.getContainerId()), null);

originTask.retryRestart();
// this invokes must before restart
logRecorder.restart(originTask.getId(), originTask.getContainerId());
// logRecorder.restart(originTask.getId(), originTask.getContainerId());

containerClient.startContainer(originTask.getContainerId());

Expand All @@ -114,8 +110,8 @@ public void success(InferenceTask originTask, InferenceTask newTask, Context con
// already be removed or any else error
log.error("container:{} may be removed, now will return error", newTask.getContainerId());

recordLog(originTask,
String.format("stage:running, task:%s container:%s not found, may be removed, now will return error", originTask.getId(), originTask.getContainerId()), null);
error(originTask,
String.format("container:%s not found, may be removed, now will return error", originTask.getContainerId()), null);

newTask.setStatus(InferenceTaskStatus.FAIL);
taskPool.failedTasks.add(newTask);
Expand All @@ -127,4 +123,9 @@ public void success(InferenceTask originTask, InferenceTask newTask, Context con
}
}
}

/**
 * @return the stage this action executes: {@link InferenceStage#RUNNING}
 */
@Override
public InferenceStage stage() {
return InferenceStage.RUNNING;
}
}
Loading