[Feature][Flink] Add built-in Flink History Server to reduce UNKNOWN job states, so the final state of a Flink task is more accurate (#3780)

Signed-off-by: Zzm0809 <[email protected]>
Co-authored-by: zackyoungh <[email protected]>
Co-authored-by: Zzm0809 <[email protected]>
3 people authored Sep 6, 2024
1 parent 732561f commit 89a1bae
Showing 25 changed files with 641 additions and 40 deletions.
@@ -23,11 +23,15 @@
import org.dinky.data.constant.DirConstant;
import org.dinky.data.exception.BusException;
import org.dinky.data.model.FlinkUdfManifest;
import org.dinky.data.model.ResourcesModelEnum;
import org.dinky.data.model.SystemConfiguration;
import org.dinky.data.result.Result;
import org.dinky.function.constant.PathConstant;
import org.dinky.function.util.ZipWriter;
import org.dinky.resource.BaseResourceManager;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.List;
@@ -37,9 +41,13 @@

import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;

import cn.dev33.satoken.annotation.SaIgnore;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.convert.Convert;
import cn.hutool.core.io.FileUtil;
@@ -58,7 +66,7 @@
@Api(tags = "UDF & App Jar Controller")
@RequestMapping("/download")
public class DownloadController {

// todo: Controller has injection risk
@GetMapping("downloadDepJar/{taskId}")
@ApiOperation("Download UDF Jar")
public void downloadJavaUDF(@PathVariable Integer taskId, HttpServletResponse resp) {
@@ -100,9 +108,9 @@ public void downloadJavaUDF(@PathVariable Integer taskId, HttpServletResponse re
}

/**
* Provide Docker with an HTTP download of dinky-app.jar
* Provide Docker to download dinky-app.jar via HTTP request
*
* @param version version
* @param version version of dinky-app.jar
* @param resp resp
*/
@GetMapping("downloadAppJar/{version}")
@@ -117,8 +125,33 @@ public void downloadAppJar(@PathVariable String version, HttpServletResponse res

@GetMapping("downloadFromRs")
@ApiOperation("Download From Resource")
public void downloadJavaUDF(String path, HttpServletResponse resp) {
@SaIgnore
public void downloadFromRs(String path, HttpServletResponse resp) {
InputStream inputStream = BaseResourceManager.getInstance().readFile(path);
ServletUtil.write(resp, inputStream);
}

// todo: There is a risk of injection in this interface
@PostMapping("uploadFromRsByLocal")
@ApiOperation("Upload From Resource By Local")
@SaIgnore
public Result<Void> uploadFromRs(String path, @RequestParam("file") MultipartFile file) {
SystemConfiguration systemConfiguration = SystemConfiguration.getInstances();
if (!systemConfiguration.getResourcesEnable().getValue()
|| !systemConfiguration.getResourcesModel().getValue().equals(ResourcesModelEnum.LOCAL)) {
return Result.failed("resources are not enabled or the resources model is not LOCAL");
}

try {
File dest = new File(path);
if (!dest.getParentFile().exists()) {
dest.getParentFile().mkdirs();
}
file.transferTo(dest);
return Result.succeed();
} catch (IOException e) {
log.error("upload file failed", e);
throw new BusException("upload file failed");
}
}
}
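
The two `todo` comments above flag the risk concretely: `uploadFromRs` writes a multipart file to a caller-supplied path with no validation, so a crafted path can escape the intended directory. A minimal sketch of a traversal guard that could close this, assuming uploads are meant to stay under a fixed base directory (the base path and class name here are hypothetical, not part of this commit):

```java
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

public final class UploadPathGuard {
    // Hypothetical base directory; Dinky would substitute its configured resources root.
    private static final Path BASE_DIR = Paths.get("/opt/dinky/resources").toAbsolutePath().normalize();

    /** Resolves a caller-supplied path and rejects anything that escapes BASE_DIR. */
    public static Path resolveSafely(String userPath) throws IOException {
        Path resolved = BASE_DIR.resolve(userPath).normalize();
        if (!resolved.startsWith(BASE_DIR)) {
            throw new IOException("path escapes upload directory: " + userPath);
        }
        return resolved;
    }
}
```

With such a guard, the controller would call `resolveSafely(path)` before `file.transferTo(...)` instead of constructing `new File(path)` from raw input.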
137 changes: 137 additions & 0 deletions dinky-admin/src/main/java/org/dinky/init/FlinkHistoryServer.java
@@ -0,0 +1,137 @@
/*
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

package org.dinky.init;

import org.dinky.data.model.ResourcesModelEnum;
import org.dinky.data.model.S3Configuration;
import org.dinky.data.model.SystemConfiguration;
import org.dinky.data.properties.OssProperties;
import org.dinky.service.JobInstanceService;
import org.dinky.service.SysConfigService;

import org.apache.flink.runtime.webmonitor.history.HistoryServerUtil;

import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;

import org.springframework.boot.ApplicationArguments;
import org.springframework.boot.ApplicationRunner;
import org.springframework.core.annotation.Order;
import org.springframework.stereotype.Component;

import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.net.NetUtil;
import lombok.extern.slf4j.Slf4j;

@Component
@Slf4j
@Order(value = 2)
public class FlinkHistoryServer implements ApplicationRunner {
public static final Set<String> HISTORY_JOBID_SET = new LinkedHashSet<>();
private final ThreadPoolExecutor threadPoolExecutor = new ThreadPoolExecutor(
5, 20, 10, TimeUnit.SECONDS, new LinkedBlockingQueue<>(100), new ThreadPoolExecutor.DiscardOldestPolicy());

private final Runnable historyRunnable;
private final SystemConfiguration systemConfiguration = SystemConfiguration.getInstances();
private final SysConfigService sysConfigService;

public FlinkHistoryServer(JobInstanceService jobInstanceService, SysConfigService sysConfigService) {
this.sysConfigService = sysConfigService;
this.historyRunnable = () -> {
Map<String, String> flinkHistoryServerConfiguration =
SystemConfiguration.getInstances().getFlinkHistoryServerConfiguration();
if (systemConfiguration.getResourcesEnable().getValue()) {
if (systemConfiguration.getResourcesModel().getValue().equals(ResourcesModelEnum.OSS)) {
OssProperties ossProperties = systemConfiguration.getOssProperties();
flinkHistoryServerConfiguration.put(S3Configuration.ENDPOINT, ossProperties.getEndpoint());
flinkHistoryServerConfiguration.put(S3Configuration.ACCESS_KEY, ossProperties.getAccessKey());
flinkHistoryServerConfiguration.put(S3Configuration.SECRET_KEY, ossProperties.getSecretKey());
flinkHistoryServerConfiguration.put(
S3Configuration.PATH_STYLE_ACCESS, String.valueOf(ossProperties.getPathStyleAccess()));
}
}

HistoryServerUtil.run(
(jobId) -> {
HISTORY_JOBID_SET.add(jobId);
threadPoolExecutor.execute(() -> {
jobInstanceService.hookJobDoneByHistory(jobId);
});
},
flinkHistoryServerConfiguration);
};
}

@Override
public void run(ApplicationArguments args) throws Exception {
AtomicReference<Thread> historyThread = new AtomicReference<>(new Thread(historyRunnable));
Runnable closeHistory = () -> {
if (historyThread.get().isAlive()) {
historyThread.get().interrupt();
HISTORY_JOBID_SET.clear();
}
};

// Check if the port is available
Consumer<Integer> checkAndUpdatePort = (port) -> {
if (!NetUtil.isUsableLocalPort(port)) {
int usableLocalPort = NetUtil.getUsableLocalPort(8000);
sysConfigService.updateSysConfigByKv(
systemConfiguration.getFlinkHistoryServerPort().getKey(), String.valueOf(usableLocalPort));
}
};
systemConfiguration.getFlinkHistoryServerPort().addChangeEvent(checkAndUpdatePort);
systemConfiguration.getFlinkHistoryServerPort().addParameterCheck(checkAndUpdatePort);
CollUtil.newArrayList(
systemConfiguration.getUseFlinkHistoryServer(),
systemConfiguration.getFlinkHistoryServerPort(),
systemConfiguration.getFlinkHistoryServerArchiveRefreshInterval())
.forEach(x -> x.addChangeEvent(d -> {
if (systemConfiguration.getUseFlinkHistoryServer().getValue()) {
closeHistory.run();
checkAndUpdatePort.accept(
systemConfiguration.getFlinkHistoryServerPort().getValue());
historyThread
.updateAndGet((t) -> new Thread(historyRunnable))
.start();

} else {
closeHistory.run();
}
}));
if (systemConfiguration.getUseFlinkHistoryServer().getValue()) {
checkAndUpdatePort.accept(
systemConfiguration.getFlinkHistoryServerPort().getValue());
try {
if (!historyThread.get().isAlive()) {
historyThread.get().start();
}
} catch (Exception e) {
log.error("Flink history server start failed: ", e);
}
}
}
}
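
Note that `HistoryServerUtil` comes from Dinky's Flink shim rather than upstream Flink, so its exact contract is internal to this commit. For orientation, the standard Flink history-server options such an embedded server presumably maps onto look like this; a sketch with illustrative values, not the commit's actual defaults:

```java
import java.util.HashMap;
import java.util.Map;

public class HistoryServerConfSketch {
    public static void main(String[] args) {
        // Illustrative values; Dinky derives the real ones from SystemConfiguration at runtime.
        Map<String, String> conf = new HashMap<>();
        conf.put("historyserver.web.port", "8082");                            // REST/UI port
        conf.put("historyserver.archive.fs.dir", "file:///tmp/flink-archive"); // archives to scan
        conf.put("historyserver.archive.fs.refresh-interval", "5000");         // scan interval in ms
        System.out.println(conf);
    }
}
```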
@@ -19,6 +19,8 @@

package org.dinky.job.handler;

import static org.dinky.utils.JsonUtils.objectMapper;

import org.dinky.api.FlinkAPI;
import org.dinky.assertion.Asserts;
import org.dinky.cluster.FlinkClusterInfo;
@@ -36,12 +38,14 @@
import org.dinky.data.flink.job.FlinkJobDetailInfo;
import org.dinky.data.flink.watermark.FlinkJobNodeWaterMark;
import org.dinky.data.model.ClusterInstance;
import org.dinky.data.model.SystemConfiguration;
import org.dinky.data.model.ext.JobInfoDetail;
import org.dinky.data.model.job.JobInstance;
import org.dinky.gateway.Gateway;
import org.dinky.gateway.config.GatewayConfig;
import org.dinky.gateway.exception.NotSupportGetStatusException;
import org.dinky.gateway.model.FlinkClusterConfig;
import org.dinky.init.FlinkHistoryServer;
import org.dinky.job.JobConfig;
import org.dinky.service.ClusterInstanceService;
import org.dinky.service.HistoryService;
@@ -52,13 +56,16 @@

import java.time.Duration;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

import org.springframework.context.annotation.DependsOn;
import org.springframework.stereotype.Component;

import com.alibaba.fastjson2.JSON;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.type.CollectionType;

import cn.hutool.core.bean.BeanUtil;
import cn.hutool.core.bean.copier.CopyOptions;
@@ -223,6 +230,13 @@ public static boolean refreshJob(JobInfoDetail jobInfoDetail, boolean needSave)
* @return {@link org.dinky.data.dto.JobDataDto}.
*/
public static JobDataDto getJobData(Integer id, String jobManagerHost, String jobId) {
if (FlinkHistoryServer.HISTORY_JOBID_SET.contains(jobId)
&& SystemConfiguration.getInstances().getUseFlinkHistoryServer().getValue()) {
jobManagerHost = "127.0.0.1:"
+ SystemConfiguration.getInstances()
.getFlinkHistoryServerPort()
.getValue();
}
JobDataDto.JobDataDtoBuilder builder = JobDataDto.builder();
FlinkAPI api = FlinkAPI.build(jobManagerHost);
try {
@@ -240,8 +254,15 @@ public static JobDataDto getJobData(Integer id, String jobManagerHost, String jo
api.getVertices(jobId).forEach(vertex -> {
flinkJobDetailInfo.getPlan().getNodes().forEach(planNode -> {
if (planNode.getId().equals(vertex)) {
planNode.setWatermark(
JsonUtils.toList(api.getWatermark(jobId, vertex), FlinkJobNodeWaterMark.class));
try {
CollectionType listType = objectMapper
.getTypeFactory()
.constructCollectionType(ArrayList.class, FlinkJobNodeWaterMark.class);
List<FlinkJobNodeWaterMark> watermark =
objectMapper.readValue(api.getWatermark(jobId, vertex), listType);
planNode.setWatermark(watermark);
} catch (Exception ignored) {
}
planNode.setBackpressure(JsonUtils.toJavaBean(
api.getBackPressure(jobId, vertex), FlinkJobNodeBackPressure.class));
}
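The watermark change above replaces `JsonUtils.toList` with an explicit Jackson `CollectionType`, which avoids generic type erasure when deserializing a JSON array into a typed list. A self-contained sketch of the same pattern, with an illustrative `Item` class standing in for `FlinkJobNodeWaterMark`:

```java
import java.util.ArrayList;
import java.util.List;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.type.CollectionType;

public class TypedListDeserialization {
    static class Item {
        public String id;  // Jackson binds JSON fields to these public members
        public long value;
    }

    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        CollectionType listType =
                mapper.getTypeFactory().constructCollectionType(ArrayList.class, Item.class);
        List<Item> items = mapper.readValue("[{\"id\":\"a\",\"value\":1}]", listType);
        System.out.println(items.get(0).id); // -> a
    }
}
```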
@@ -94,6 +94,8 @@ public interface JobInstanceService extends ISuperService<JobInstance> {
*/
boolean hookJobDone(String jobId, Integer taskId);

boolean hookJobDoneByHistory(String jobId);

/**
* Refresh the job instances for the given task IDs.
*
@@ -66,6 +66,7 @@
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import cn.hutool.core.util.StrUtil;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;

@@ -245,6 +246,35 @@ public boolean hookJobDone(String jobId, Integer taskId) {
return isDone;
}

@Override
public boolean hookJobDoneByHistory(String jobId) {
LambdaQueryWrapper<JobInstance> queryWrapper = new LambdaQueryWrapper<>();
queryWrapper
.eq(JobInstance::getJid, jobId)
.orderByDesc(JobInstance::getCreateTime)
.last("limit 1");
JobInstance instance = baseMapper.selectOne(queryWrapper);

if (instance == null
|| !StrUtil.equalsAny(
instance.getStatus(), JobStatus.RECONNECTING.getValue(), JobStatus.UNKNOWN.getValue())) {
// No corresponding JobInstance (or one already past RECONNECTING/UNKNOWN) means the job
// may never have run successfully; return true to prevent retries.
return true;
}

DaemonTaskConfig config = DaemonTaskConfig.build(FlinkJobTask.TYPE, instance.getId());
DaemonTask daemonTask = FlinkJobThreadPool.getInstance().removeByTaskConfig(config);
daemonTask = Optional.ofNullable(daemonTask).orElse(DaemonTask.build(config));

boolean isDone = daemonTask.dealTask();
// If the task is not completed, it is re-queued
if (!isDone) {
FlinkJobThreadPool.getInstance().execute(daemonTask);
}
return isDone;
}

@Override
public void refreshJobByTaskIds(Integer... taskIds) {
for (Integer taskId : taskIds) {
@@ -19,6 +19,8 @@

package org.dinky.service.impl;

import static org.dinky.data.model.SystemConfiguration.FLINK_JOB_ARCHIVE;

import org.dinky.assertion.Asserts;
import org.dinky.assertion.DinkyAssert;
import org.dinky.config.Dialect;
@@ -233,6 +235,14 @@ public JobConfig buildJobSubmitConfig(TaskDTO task) {
// When disabling checkpoints, delete the checkpoint path
config.setSavePointPath(null);
}
if (SystemConfiguration.getInstances().getUseFlinkHistoryServer().getValue()) {
config.getConfigJson().compute("jobmanager.archive.fs.dir", (k, v) -> {
if (StringUtils.isNotBlank(v)) {
return v + "," + FLINK_JOB_ARCHIVE;
}
return FLINK_JOB_ARCHIVE;
});
}
if (GatewayType.get(task.getType()).isDeployCluster()) {
log.info("Init gateway config, type:{}", task.getType());
FlinkClusterConfig flinkClusterCfg =
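The `compute` call above appends Dinky's `FLINK_JOB_ARCHIVE` directory to any user-configured `jobmanager.archive.fs.dir` as a comma-separated list, so finished jobs are archived where the built-in history server can find them. A minimal sketch of that merge semantics, with illustrative key and values:

```java
import java.util.HashMap;
import java.util.Map;

public class ArchiveDirMerge {
    public static void main(String[] args) {
        String dinkyArchive = "file:///tmp/dinky/flink-job-archive"; // stand-in for FLINK_JOB_ARCHIVE
        Map<String, String> conf = new HashMap<>();
        conf.put("jobmanager.archive.fs.dir", "hdfs:///completed-jobs"); // pre-existing user setting

        // Append Dinky's archive dir, or set it when the user configured nothing.
        conf.compute("jobmanager.archive.fs.dir",
                (k, v) -> (v == null || v.trim().isEmpty()) ? dinkyArchive : v + "," + dinkyArchive);

        System.out.println(conf.get("jobmanager.archive.fs.dir"));
        // -> hdfs:///completed-jobs,file:///tmp/dinky/flink-job-archive
    }
}
```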