-
Notifications
You must be signed in to change notification settings - Fork 9.1k
YARN-11823: add new endpoints for getting jstacks of application and nodes #7726
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: trunk
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
/** * Licensed to the Apache Software Foundation (ASF) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The ASF licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.hadoop.yarn.server.nodemanager; | ||
|
||
import org.apache.hadoop.util.Shell; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import java.io.*; | ||
import java.nio.charset.StandardCharsets; | ||
import java.nio.file.Files; | ||
import java.nio.file.StandardCopyOption; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
|
||
public class DiagnosticJStackService { | ||
|
||
private static final Logger LOG = LoggerFactory | ||
.getLogger(DiagnosticJStackService.class); | ||
private static final String PYTHON_COMMAND = "python3"; | ||
private static String scriptLocation = null; | ||
|
||
static { | ||
try { | ||
// Extract script from JAR to a temp file | ||
InputStream in = DiagnosticJStackService.class.getClassLoader() | ||
.getResourceAsStream("diagnostics/jstack_collector.py"); | ||
File tempScript = File.createTempFile("jstack_collector", ".py"); | ||
Files.copy(in, tempScript.toPath(), StandardCopyOption.REPLACE_EXISTING); | ||
tempScript.setExecutable(true); // Set execute permission | ||
scriptLocation = tempScript.getAbsolutePath(); | ||
} catch (IOException e) { | ||
LOG.error("Failed to extract Python script from JAR", e); | ||
} | ||
} | ||
|
||
public static String collectNodeJStack() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At first I read this as NodeJS; can we use another name here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, I am thinking of changing to |
||
throws Exception { | ||
if (Shell.WINDOWS) { | ||
throw new UnsupportedOperationException("Not implemented for Windows"); | ||
} | ||
|
||
ProcessBuilder pb = createProcessBuilder(); | ||
|
||
return executeCommand(pb); | ||
|
||
} | ||
|
||
|
||
|
||
public static String collectAppJStack(String appId) | ||
throws Exception { | ||
if (Shell.WINDOWS) { | ||
throw new UnsupportedOperationException("Not implemented for Windows."); | ||
} | ||
ProcessBuilder pb = createProcessBuilder(appId); | ||
|
||
LOG.info("Diagnostic process environment: {}", pb.environment()); | ||
|
||
return executeCommand(pb); | ||
} | ||
|
||
protected static ProcessBuilder createProcessBuilder() { | ||
List<String> commandList = | ||
new ArrayList<>(Arrays.asList(PYTHON_COMMAND, scriptLocation)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we need an ArrayList? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because the
|
||
|
||
return new ProcessBuilder(commandList); | ||
} | ||
|
||
|
||
protected static ProcessBuilder createProcessBuilder(String appId) { | ||
List<String> commandList = | ||
new ArrayList<>(Arrays.asList(PYTHON_COMMAND, scriptLocation, appId)); | ||
|
||
return new ProcessBuilder(commandList); | ||
} | ||
|
||
private static String executeCommand(ProcessBuilder pb) | ||
throws Exception { | ||
Process process = pb.start(); | ||
int exitCode; | ||
StringBuilder outputBuilder = new StringBuilder(); | ||
StringBuilder errorBuilder = new StringBuilder(); | ||
|
||
try ( | ||
BufferedReader stdoutReader = new BufferedReader(new InputStreamReader(process.getInputStream(), | ||
StandardCharsets.UTF_8)); | ||
BufferedReader stderrReader = new BufferedReader(new InputStreamReader(process.getErrorStream(), | ||
StandardCharsets.UTF_8)); | ||
) { | ||
|
||
String line; | ||
while ((line = stdoutReader.readLine()) != null) { | ||
outputBuilder.append(line).append("\n"); | ||
} | ||
|
||
while ((line = stderrReader.readLine()) != null) { | ||
errorBuilder.append(line).append("\n"); | ||
} | ||
if (!errorBuilder.toString().isEmpty()) { | ||
LOG.error("Python script stderr: {}", errorBuilder); | ||
} | ||
|
||
process.waitFor(); | ||
} catch (Exception e) { | ||
LOG.error("Error getting JStack: {}", pb.command()); | ||
throw e; | ||
} | ||
exitCode = process.exitValue(); | ||
if (exitCode != 0) { | ||
throw new IOException("The JStack collector script exited with non-zero " + | ||
"exit code: " + exitCode); | ||
} | ||
|
||
return outputBuilder.toString(); | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,12 +31,14 @@ | |
import java.util.Set; | ||
|
||
import org.apache.hadoop.io.IOUtils; | ||
import org.apache.hadoop.yarn.server.nodemanager.DiagnosticJStackService; | ||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.records.AuxServiceRecord; | ||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.records.AuxServiceRecords; | ||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePlugin; | ||
import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager; | ||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.AuxiliaryServicesInfo; | ||
import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo; | ||
import org.apache.hadoop.yarn.webapp.WebAppException; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
|
@@ -271,6 +273,35 @@ public ContainerInfo getNodeContainer(@javax.ws.rs.core.Context | |
|
||
} | ||
|
||
@GET | ||
@Path("/jstack") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this name can be a bit misleading, because we already have a /stacks API for jstack. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, how does it differ from /stacks? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Stack and JStack are totally different from each other. JStack is run against a currently running Java process to see what each thread is actually doing, while Stack is just a list of the active methods that have been called. Here is an example of JStack:
Here is an example Stack:
|
||
@Produces({MediaType.TEXT_PLAIN}) | ||
public Response getNodeJStack() { | ||
try { | ||
return Response.status(Status.OK) | ||
.entity(DiagnosticJStackService.collectNodeJStack()) // Make sure the NodeManager have python3 install | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What will happen if py3 is not present? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is quite ambiguous when the python3 is not installed. The exception will only be shown when I execute the script manually. If I try to access the endpoint at RM without the |
||
.build(); | ||
} catch (Exception e) { | ||
throw new WebAppException("Error collection NodeManager JStack: " + e.getMessage() + ". " + | ||
"For more information please check the NodeManager logs."); | ||
} | ||
} | ||
|
||
|
||
@GET | ||
@Path("/apps/{appid}/jstack") | ||
@Produces({MediaType.TEXT_PLAIN}) | ||
public Response getApplicationJStack(@PathParam("appid") String appId) { | ||
try { | ||
return Response.status(Status.OK) | ||
.entity(DiagnosticJStackService.collectAppJStack(appId)) // Make sure the NodeManager have python3 install | ||
.build(); | ||
} catch (Exception e) { | ||
throw new WebAppException("Error collecting Application JStack: " + e.getMessage() + ". " + | ||
"For more information please check the NodeManager logs."); | ||
} | ||
} | ||
|
||
/** | ||
* Returns log file's name as well as current file size for a container. | ||
* | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
# Licensed to the Apache Software Foundation (ASF) under one | ||
Check failure on line 1 in hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/resources/diagnostics/jstack_collector.py
|
||
# or more contributor license agreements. See the NOTICE file | ||
# distributed with this work for additional information | ||
# regarding copyright ownership. The ASF licenses this file | ||
# to you under the Apache License, Version 2.0 (the | ||
# "License"); you may not use this file except in compliance | ||
# with the License. You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import subprocess | ||
import sys | ||
|
||
NUMBER_OF_JSTACK = 3 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This can be passed through REST. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good idea, it would be nice to make that number configurable from the REST API. |
||
|
||
def get_nodemanager_pid(): | ||
Check failure on line 22 in hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/resources/diagnostics/jstack_collector.py
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe that, from a security perspective, these should not be available in the REST API on a non-secure cluster, and we should do authorisation on secured clusters. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm, why is that? The script only gets the Java processes of the active containers and executes the JStack command on them; it is not as if a user could modify the script or do something malicious? |
||
results = run_command("ps aux | grep nodemanager | grep -v grep") | ||
# ps aux | grep nodemanager | grep -v grep | ||
# root 414 1.3 1.7 8124480 434520 ? Sl 11:36 0:52 /usr/lib/jvm/java-8-openjdk//bin/java -Dproc_nodemanager -Djava.net.preferIPv4Stack=true -Dyarn.log.dir=/opt/hadoop/logs -Dyarn.log.file=hadoop.log -Dyarn.home.dir=/opt/hadoop -Dyarn.root.logger=INFO,console -Dhadoop.log.dir=/opt/hadoop/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/opt/hadoop -Dhadoop.id.str=root -Dhadoop.root.logger=INFO,console -Dhadoop.policy.file=hadoop-policy.xml -Dhadoop.security.logger=INFO,NullAppender -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.math=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.text=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.zip=ALL-UNNAMED --add-opens=java.base/sun.security.util=ALL-UNNAMED --add-opens=java.base/sun.security.x509=ALL-UNNAMED org.apache.hadoop.yarn.server.nodemanager.NodeManager | ||
Check failure on line 25 in hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/resources/diagnostics/jstack_collector.py
|
||
pids = [] # Some host may contain more than one NodeManager | ||
for result in results.strip().splitlines(): | ||
pid = result.split()[1] | ||
pids.append(pid) | ||
|
||
return pids | ||
|
||
|
||
def get_app_pid(app_id):
    """Return the PIDs of the Java processes listed by ``ps aux``.

    ``app_id`` is accepted but not used for filtering yet (see the TODO
    below): every process whose ``ps`` line contains ``jvm/java`` is
    returned, except lines containing ``/bin/bash`` or ``grep``.

    Returns a list of PID strings; the list is empty when nothing matches.
    """
    # Shape of the ps lines being parsed (the PID is the second column):
    #   root   413 1.7 2.0 8355580 512972 ? Sl 11:21 2:56 /usr/lib/jvm/java-8-openjdk//bin/java -Dproc_nodemanager ... org.apache.hadoop.yarn.server.nodemanager.NodeManager
    #   root 41611 4.1 1.9 2414568 470660 ? Sl 14:08 0:16 /usr/lib/jvm/java-8-openjdk//bin/java -Xmx750m org.apache.hadoop.yarn.applications.distributedshell.ApplicationMaster ...
    results = run_command("ps aux | grep jvm/java | grep -v -e /bin/bash -e grep")  # TODO: later include "grep app_id" for long java application like mapreduce

    pids = []
    for result in results.strip().splitlines():
        fields = result.split()
        # run_command() returns an error sentence instead of raising when the
        # pipeline fails (grep exits non-zero when nothing matches). Only a
        # numeric second column is a PID; otherwise a word of that sentence
        # would be passed to jstack as if it were a process id.
        if len(fields) > 1 and fields[1].isdigit():
            pids.append(fields[1])

    return pids
|
||
|
||
def execute_jstack(pids):
    """Run ``jstack`` NUMBER_OF_JSTACK times for each PID.

    Every dump is prefixed with a header naming its iteration and PID, and
    the dumps are returned as one string joined by newlines, grouped by PID
    in the order given.
    """
    section_template = "--- JStack iteration-{} for PID: {} ---\n{}"

    sections = [
        section_template.format(iteration, pid, run_command("jstack", pid))
        for pid in pids
        for iteration in range(NUMBER_OF_JSTACK)  # Get multiple jstack
    ]

    return "\n".join(sections)
|
||
|
||
def run_command(*argv):
    """Join ``argv`` with spaces, run the result through the shell and
    return its stdout decoded as UTF-8.

    The command line is echoed to stdout before it runs. This function does
    not raise on failure: a non-zero exit status or any other exception is
    turned into a message that is printed to stderr and returned in place of
    the command output.
    """
    try:
        command_line = " ".join(argv)
        print("Running command with arguments:", command_line)
        completed = subprocess.run(
            command_line,
            shell=True,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        return completed.stdout.decode('utf-8')
    except subprocess.CalledProcessError as err:
        failure = "Unable to run command: {}".format(err)
    except Exception as err:
        failure = "Exception occurred: {}".format(err)

    # Only reached when one of the handlers above ran.
    print(failure, file=sys.stderr)
    return failure
|
||
|
||
def main():
    """Entry point of the collector script.

    With a command-line argument (an application id such as
    "application_1748517687882_0013") the PIDs come from get_app_pid();
    without one they come from get_nodemanager_pid(). The collected jstack
    text is printed to stdout, or a notice is printed and the script exits
    with status 0 when no PID was found.
    """
    cli_args = sys.argv[1:]
    pids = get_app_pid(cli_args[0]) if cli_args else get_nodemanager_pid()

    if pids:
        # The Java ProcessBuilder that launched this script reads this stdout.
        print(execute_jstack(pids))
        return

    print("No active process id in this NodeManager.")
    sys.exit(0)


if __name__ == "__main__":
    main()
|
||
Check failure on line 95 in hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/resources/diagnostics/jstack_collector.py
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This static block will block the NM from starting up until it is done.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
According to my testing, it is very fast when I access the JStack endpoint. Do you happen to have a better idea of getting the script file from /resources folder?