Skip to content

Latest commit

 

History

History
372 lines (339 loc) · 20.8 KB

troubleshooting.org

File metadata and controls

372 lines (339 loc) · 20.8 KB

These nodes have high swap and/or crashed

;; For each listed node, run `pbsnodes NODE', collect the distinct job ids
;; mentioned in its output, and return one row per job: the node name
;; followed by the whitespace-split fields of a line of `qstat JOB' output.
(require 'cl-lib) ; `cl-loop' and `cl-pushnew'; unprefixed `loop' needs the obsolete `cl'
(cl-loop
 for node in '("n17")
 append
 (with-temp-buffer
   (insert (shell-command-to-string (format "pbsnodes %s" node)))
   (goto-char (point-min))
   (let ((found-jobs '()))
     ;; The pattern expects NUMBER/JOBID.  Dots are escaped and at least
     ;; one digit is required so that only literal job ids match.
     (while (re-search-forward
             "[0-9]+/\\([0-9]+\\.gilgamesh\\.cheme\\.cmu\\.edu\\)" nil t)
       ;; `cl-pushnew' works on a `let'-bound variable under lexical
       ;; binding, which `add-to-list' with a quoted symbol does not.
       (cl-pushnew (match-string 1) found-jobs :test #'equal))
     (cl-loop
      for job in found-jobs
      ;; The third line of the qstat output is used as the job's row
      ;; (the code assumes the first two lines are headers).
      for status = (nth 2 (split-string
                           (shell-command-to-string (format "qstat %s" job))
                           "\n"))
      ;; Skip a job whose qstat output has no third line rather than
      ;; signalling an error from `split-string' on nil.
      when status
      collect (cons node (split-string status))))))

Getting memory usage of jobs

qstat -f -x
<Data>
  <Job>
    <Job_Id>1429368.gilgamesh.cheme.cmu.edu</Job_Id>
    <Job_Name>/home-research/chenw3/research/molecules/strcuture=mono211/vacancy=O8</Job_Name>
    <Job_Owner>[email protected]</Job_Owner>
    <resources_used>
      <cput>169:47:59</cput>
      <mem>836960kb</mem>
      <vmem>1818604kb</vmem>
      <walltime>31:02:52</walltime>
    </resources_used>
    <job_state>R</job_state>
    <queue>long</queue>
    <server>gilgamesh.cheme.cmu.edu</server>
    <Checkpoint>u</Checkpoint>
    <ctime>1469493835</ctime>
    <Error_Path>gilgamesh.cheme.cmu.edu:/home-research/chenw3/research/molecules/strcuture=mono211/vacancy=O8/vacancy=O8.e1429368</Error_Path>
    <exec_host>n27/26</exec_host>
    <Hold_Types>n</Hold_Types>
    <Join_Path>oe</Join_Path>
    <Keep_Files>n</Keep_Files>
    <Mail_Points>a</Mail_Points>
    <mtime>1469493836</mtime>
    <Output_Path>gilgamesh.cheme.cmu.edu:/home-research/chenw3/research/molecules/strcuture=mono211/vacancy=O8/vacancy=O8.o1429368</Output_Path>
    <Priority>0</Priority><qtime>1469493835</qtime>
    <Rerunable>True</Rerunable>
    <Resource_List>
      <mem>2gb</mem>
      <neednodes>1:ppn=1</neednodes>
      <nice>15</nice>
      <nodect>1</nodect>
      <nodes>1:ppn=1</nodes>
      <walltime>168:00:00</walltime>
    </Resource_List>
    <session_id>67857</session_id>
    <substate>42</substate>
    <Variable_List>PBS_O_HOME=/home-research/chenw3,PBS_O_LANG=en_US.UTF-8,PBS_O_LOGNAME=chenw3,PBS_O_PATH=/home-research/chenw3/python/ase/tools:/home-research/chenw3/python/jasp/jasp/bin:/opt/kitchingroup/vasp-5.3.5/vtstscripts-914:/home-research/jkitchin/bin:/opt/vtk/bin:/opt/kitchingroup/vasp-5.3.5/vaspy/bin:/opt/kitchingroup/vasp-5.3.5/ase-s16/tools:/opt/kitchingroup/vasp-5.3.5/jasp-s16/jasp/bin:/opt/kitchingroup/vasp-5.3.5/bin:/usr/mpi/intel/openmpi-1.4-qlc/bin:/opt/kitchingroup/CANOPY/Canopy_64bit/User/bin:/opt/maui/bin:/opt/fav/bin:/usr/local/texlive/2012/bin/x86_64-linux:/opt/intel/Compiler/11.1/072/bin/intel64:/opt/intel/impi/4.0.0.028/intel64/bin:/usr/lib64/qt-3.3/bin:/usr/kerberos/bin:/usr/local/bin:/bin:/usr/bin:/usr/share/pvm3/lib:/home-research/chenw3/bin,PBS_O_MAIL=/var/spool/mail/chenw3,PBS_O_SHELL=/bin/bash,PBS_O_HOST=gilgamesh.cheme.cmu.edu,PBS_SERVER=gilgamesh.cheme.cmu.edu,PBS_O_WORKDIR=/home-research/chenw3/research/molecules/strcuture=mono211/vacancy=O8,PBS_O_QUEUE=q_feed</Variable_List>

    <euser>chenw3</euser>
    <egroup>kitchingroup</egroup>
    <hashname>1429368.gilgamesh.cheme.cmu.edu</hashname>
    <queue_rank>623167</queue_rank>
    <queue_type>E</queue_type>
    <etime>1469493835</etime>
    <submit_args>-joe -N /home-research/chenw3/research/molecules/strcuture=mono211/vacancy=O8 -l walltime=168:00:00 -l nodes=1:ppn=1 -l mem=2GB</submit_args>
    <start_time>1469493836</start_time>
    <start_count>1</start_count>
  </Job>
</Data>
(defun job-memory-used (jobid)
  "Return the memory used by job JOBID in GB, as a float.
Run `qstat -f -x JOBID', parse the XML it prints and read the text
of the Job/resources_used/mem element.  The leading number of that
text is divided by 1024 twice, i.e. it is assumed to be in kb (the
sample output in this file looks like \"836960kb\").
Return 0.0 when the output contains no resources_used/mem element,
instead of signalling an error."
  (let* ((xml (with-temp-buffer
                (insert (shell-command-to-string
                         (format "qstat -f -x %s" jobid)))
                (car (xml-parse-region (point-min) (point-max)))))
         (job (car (xml-get-children xml 'Job)))
         (resources (car (xml-get-children job 'resources_used)))
         (mem (car (xml-get-children resources 'mem)))
         ;; A parsed node is (NAME ATTRIBUTES CHILD...), so the third
         ;; element is the first child: the text of <mem>.
         (text (nth 2 mem)))
    (if (stringp text)
        ;; `string-to-number' reads the leading digits and ignores the
        ;; unit suffix.
        (/ (string-to-number text) 1024.0 1024)
      0.0)))

;; Build a table (header row, an `hline' marker, then one row per job) for
;; nodes n1..n30.  Each job row is the node name, the job's memory use in GB
;; from `job-memory-used' formatted with two decimals, and the
;; whitespace-split fields of a line of `qstat JOB' output.
(require 'cl-lib) ; `cl-loop' and `cl-pushnew'; unprefixed `loop' needs the obsolete `cl'
(append
 '(("node" "mem (GB)" "jobid" "name" "user" "time" "state" "queue"))
 '(hline)
 (cl-loop
  for node in (mapcar (lambda (n) (format "n%s" n)) (number-sequence 1 30))
  append
  (with-temp-buffer
    (insert (shell-command-to-string (format "pbsnodes %s" node)))
    (goto-char (point-min))
    (let ((found-jobs '()))
      ;; The pattern expects NUMBER/JOBID.  Dots are escaped and at least
      ;; one digit is required so that only literal job ids match.
      (while (re-search-forward
              "[0-9]+/\\([0-9]+\\.gilgamesh\\.cheme\\.cmu\\.edu\\)" nil t)
        ;; `cl-pushnew' works on a `let'-bound variable under lexical
        ;; binding, which `add-to-list' with a quoted symbol does not.
        (cl-pushnew (match-string 1) found-jobs :test #'equal))
      (cl-loop
       for job in found-jobs
       ;; The third line of the qstat output is used as the job's row
       ;; (the code assumes the first two lines are headers).
       for status = (nth 2 (split-string
                            (shell-command-to-string (format "qstat %s" job))
                            "\n"))
       ;; Skip a job whose qstat output has no third line rather than
       ;; signalling an error from `split-string' on nil.
       when status
       collect (append (list node (format "%1.2f" (job-memory-used job)))
                       (split-string status)))))))
| node | mem (GB) | jobid | name | user | time | state | queue |
n2520.541466777.gilgamesh…/db14/11-11chenw3401:46:3Rlong
n2620.541466776.gilgamesh…/db14/10-10chenw3422:22:1Rlong
n2620.541466775.gilgamesh…/db14/9-9chenw3420:40:5Rlong
n719.631471426.gilgamesh…/db10/20-20/tianyug1273:06:0Rlong
n219.511471428.gilgamesh…/db10/25-25/tianyug1302:17:4Rlong
n1911.201467689.gilgamesh…p+0.035_8.4_0fgeng130:57:5Rlong
n2611.041471464.gilgamesh…/p+0.02_9_0fgeng42:21:37Rlong
n2910.931471463.gilgamesh…/p+0.02_9_0fgeng38:58:03Rlong
n2910.631473125.gilgamesh…/p+0.02_8.4_0fgeng33:23:03Rlong
n2910.451473119.gilgamesh…/p+0.02_8.4_1fgeng32:33:40Rlong
n39.871474577.gilgamesh…d76edc9f25a6ejhaddad16:56:01Rshort
n29.511474578.gilgamesh…20722a1841238jhaddad16:56:01Rshort
n299.451471434.gilgamesh…p+0.035_8.4_4fgeng41:27:04Rlong
n68.471474571.gilgamesh…34278757d5ffcjhaddad17:29:02Rshort
n63.461474572.gilgamesh…ad165eb84c0afjhaddad17:29:01Rshort
n263.461474592.gilgamesh…234eaa949c017jhaddad16:29:26Rshort
n253.331474593.gilgamesh…0a13cd1ebc497jhaddad16:30:00Rshort
n53.301474573.gilgamesh…a76cdea8fb461jhaddad17:29:41Rshort
n253.211474594.gilgamesh…32f2449f643a0jhaddad16:30:00Rshort
n263.211474589.gilgamesh…becee9d712702jhaddad16:29:26Rshort
n53.191474574.gilgamesh…9baf812609fcdjhaddad16:56:52Rshort
n253.101474595.gilgamesh…d21aadc0c2a3ajhaddad16:30:00Rshort
n263.091474590.gilgamesh…2231ec7a5994ajhaddad16:29:26Rshort
n53.081474575.gilgamesh…edea8f0c0a824jhaddad16:56:07Rshort
n32.991474576.gilgamesh…ad7faa5db3305jhaddad16:56:01Rshort
n252.991474596.gilgamesh…0318d6a294487jhaddad16:29:59Rshort
n262.991474591.gilgamesh…a55376086af50jhaddad16:29:26Rshort
n20.061474601.gilgamesh…/0-4-nebtianyug100:00:00Rlong
n30.061474600.gilgamesh…/s-s-nebtianyug100:00:00Rlong
n30.061474599.gilgamesh…/s-t-nebtianyug100:00:00Rlong
n30.061474598.gilgamesh…/s-s-nebtianyug100:00:00Rlong
n30.061470327.gilgamesh…580eaf5b18d7bjboes00:00:00Rshort
n40.061474700.gilgamesh…b98eee577a9b2tengm00:00:00Rshort
n50.061474697.gilgamesh…d4048f735659dtengm00:00:00Rshort
n50.061474696.gilgamesh…ef2ab9c886032tengm00:00:00Rshort
n50.061474695.gilgamesh…974fe16975f01tengm00:00:00Rshort
n50.061474694.gilgamesh…8f6edbce24dc9tengm00:00:00Rshort
n60.061474790.gilgamesh…d00c63190b64ftengm00:00:05Rshort
n60.061474789.gilgamesh…07628d86cef42tengm00:00:06Rshort
n60.061474788.gilgamesh…ddd4f6927daaftengm00:00:06Rshort
n60.061474787.gilgamesh…b128e1061cb83tengm00:00:06Rshort
n60.061474786.gilgamesh…e19f4ad64a31btengm00:00:06Rshort
n60.061474785.gilgamesh…aa57da79fc7fbtengm00:00:06Rshort
n60.061474784.gilgamesh…3472fc2cc7a80tengm00:00:05Rshort
n90.061474783.gilgamesh…95df861f67fdctengm00:00:07Rshort
n90.061474782.gilgamesh…8f13da99f2a79tengm00:00:07Rshort
n90.061474781.gilgamesh…9166d14ef3aeftengm00:00:07Rshort
n90.061474780.gilgamesh…172c6e7e414ebtengm00:00:07Rshort
n90.061474779.gilgamesh…3ba570f10a816tengm00:00:07Rshort
n90.061474778.gilgamesh…8efdc61bea56dtengm00:00:07Rshort
n90.061474777.gilgamesh…cc34a5a3b6537tengm00:00:07Rshort
n130.061474776.gilgamesh…46ba29fe6932ftengm00:00:00Rshort
n130.061474766.gilgamesh…4a0f5aed723aftengm00:00:00Rshort
n130.061474765.gilgamesh…9743cd15226c6tengm00:00:00Rshort
n130.061474721.gilgamesh…f7f6f6eb791a2tengm00:00:00Rshort
n170.061474772.gilgamesh…1ca8516b7fadctengm00:00:00Rshort
n170.061474770.gilgamesh…56a237163a283tengm00:00:00Rshort
n170.061474769.gilgamesh…87d63e4945fa0tengm00:00:00Rshort
n170.061474768.gilgamesh…d3f7d07f1933ftengm00:00:00Rshort
n170.061474775.gilgamesh…1184b68ab8a56tengm00:00:00Rshort
n170.061474771.gilgamesh…b2e3c9dbcfb6etengm00:00:00Rshort
n170.061474774.gilgamesh…947dcb393f247tengm00:00:00Rshort
n170.061474773.gilgamesh…0aef53682f2c3tengm00:00:00Rshort
n180.061474767.gilgamesh…875738357eebatengm00:02:48Rshort
n180.061474713.gilgamesh…2a1570af7cae7tengm00:01:39Rshort
n180.061474712.gilgamesh…1215223305f80tengm00:00:12Rshort
n180.061474710.gilgamesh…aa88ff3b4b367tengm00:00:00Rshort
n180.061474709.gilgamesh…87f125cfa6706tengm00:00:00Rshort
n180.061474708.gilgamesh…2331ebd7d0c0atengm00:00:00Rshort
n180.061474706.gilgamesh…5692c84eb9575tengm00:00:00Rshort
n180.061474704.gilgamesh…5bdbab742e9e5tengm00:00:00Rshort
n190.061474703.gilgamesh…9b6f8b48319c0tengm00:00:00Rshort
n190.061474702.gilgamesh…281a7c88822b5tengm00:00:00Rshort
n190.061474714.gilgamesh…c54f93ccbbe06tengm00:00:00Rshort
n190.061474656.gilgamesh…dce20edae5563tengm00:00:00Rshort
n190.061474722.gilgamesh…2d81265e5b630tengm00:07:31Rshort
n190.061474139.gilgamesh…/site=0tianyug100:00:00Rlong
n220.061474682.gilgamesh…0ca786bf9768atengm00:00:00Rshort
n220.061474717.gilgamesh…746dd6cd86fc8tengm00:00:00Rshort
n220.061474648.gilgamesh…b0a1c7622e12btengm00:00:00Rshort
n220.061474762.gilgamesh…92ba8c73df30etengm00:00:00Rshort
n220.061474718.gilgamesh…7ec94ae410ba3tengm00:00:00Rshort
n220.061474705.gilgamesh…7b7323a7bd06ftengm00:00:00Rshort
n220.061474691.gilgamesh…0fd0e2a6b9f7ctengm00:00:00Rshort
n220.061474711.gilgamesh…c8022c3e03dd4tengm00:00:00Rshort
n220.061474681.gilgamesh…7194b53ab3e4btengm00:00:00Rshort
n220.061474723.gilgamesh…909918af0e09atengm00:00:00Rshort
n220.061474688.gilgamesh…88d537f8d88fdtengm00:00:00Rshort
n220.061474679.gilgamesh…93bf47bd34298tengm00:00:00Rshort
n230.061474615.gilgamesh…22cf0cbf79127tengm00:00:00Rshort
n230.061474685.gilgamesh…593541e820d42tengm00:00:03Rshort
n230.061474684.gilgamesh…7e2c05533acbftengm00:00:00Rshort
n230.061474680.gilgamesh…01e003afcc633tengm00:00:01Rshort
n230.061474646.gilgamesh…71c8c3aa05fc3tengm00:00:00Rshort
n230.061474653.gilgamesh…588ebb98a1591tengm00:00:01Rshort
n230.061474652.gilgamesh…51a0572519845tengm00:00:00Rshort
n250.061474715.gilgamesh…38d825051255btengm00:00:00Rshort
n250.061474707.gilgamesh…6d46ca9169d24tengm00:00:00Rshort
n250.061474759.gilgamesh…cc2613d97e530tengm00:00:00Rshort
n250.061474701.gilgamesh…a2322e597dbc0tengm00:00:00Rshort
n250.061474760.gilgamesh…cddeff1aadc7etengm00:00:00Rshort
n250.061474145.gilgamesh…/site=terracetianyug100:00:00Rlong
n250.061474716.gilgamesh…e1250fcc29a39tengm00:00:00Rshort
n250.061474761.gilgamesh…5df2050b4efc9tengm00:00:00Rshort
n260.061470328.gilgamesh…c3ec9c78aa455jboes00:00:00Rshort
n290.061474144.gilgamesh…/site=1tianyug100:00:02Rlong
n290.061474143.gilgamesh…/site=0tianyug100:00:02Rlong
n290.061474791.gilgamesh…64f8fc6b33cd8tengm00:00:00Rshort
n290.061469907.gilgamesh…/0-1-nebtianyug100:00:00Rlong
n290.061474142.gilgamesh…class=perfecttianyug100:00:02Rlong
n290.061474140.gilgamesh…/site=1tianyug100:00:02Rlong
n250.051471462.gilgamesh…vib-dt1-496upfgeng41:12:37Rlong
n290.031471457.gilgamesh…vib-dt1-496upfgeng39:51:42Rlong
n160.011474597.gilgameshCondVR.shazeeshan00:00:00Rlong

job property

(defun job-property (jobid &rest xpath)
  "Return the text of the element reached by XPATH in job JOBID's record.
Run `qstat -f -x JOBID', parse the XML it prints, and starting from
the Job element descend through the child element names (symbols)
given in XPATH, taking the first matching child at each step.
Return the first child of the final element, which for a leaf
element is its text.  Return nil when any step of the path is
missing.

Example: (job-property \"1430240\" \\='resources_used \\='vmem)"
  (let* ((xml (with-temp-buffer
                (insert (shell-command-to-string
                         (format "qstat -f -x %s" jobid)))
                (car (xml-parse-region (point-min) (point-max)))))
         (node (car (xml-get-children xml 'Job))))
    ;; Walk down one element name at a time; a missing element leaves
    ;; NODE nil and every later lookup on nil also yields nil.
    (dolist (tag xpath)
      (setq node (car (xml-get-children node tag))))
    ;; A parsed node is (NAME ATTRIBUTES CHILD...); return the first child.
    (nth 2 node)))


  ;; Example call: fetch the text of resources_used/vmem for job 1430240.
  (job-property "1430240" 'resources_used 'vmem)
#!/bin/bash

# Print the current working directory.
pwd
;; For each listed node, run `pbsnodes NODE', collect the distinct job ids
;; mentioned in its output, and return one row per job: the node name
;; followed by the whitespace-split fields of a line of `qstat JOB' output.
(require 'cl-lib) ; `cl-loop' and `cl-pushnew'; unprefixed `loop' needs the obsolete `cl'
(cl-loop
 for node in '("n15" "n16" "n17" "n18" "n19" "n22" "n23")
 append
 (with-temp-buffer
   (insert (shell-command-to-string (format "pbsnodes %s" node)))
   (goto-char (point-min))
   (let ((found-jobs '()))
     ;; The pattern expects NUMBER/JOBID.  Dots are escaped and at least
     ;; one digit is required so that only literal job ids match.
     (while (re-search-forward
             "[0-9]+/\\([0-9]+\\.gilgamesh\\.cheme\\.cmu\\.edu\\)" nil t)
       ;; `cl-pushnew' works on a `let'-bound variable under lexical
       ;; binding, which `add-to-list' with a quoted symbol does not.
       (cl-pushnew (match-string 1) found-jobs :test #'equal))
     (cl-loop
      for job in found-jobs
      ;; The third line of the qstat output is used as the job's row
      ;; (the code assumes the first two lines are headers).
      for status = (nth 2 (split-string
                           (shell-command-to-string (format "qstat %s" job))
                           "\n"))
      ;; Skip a job whose qstat output has no third line rather than
      ;; signalling an error from `split-string' on nil.
      when status
      collect (cons node (split-string status))))))

<2017-01-12 Thu>

;; For each listed node, run `pbsnodes NODE', collect the distinct job ids
;; mentioned in its output, and return one row per job: the node name
;; followed by the whitespace-split fields of a line of `qstat JOB' output.
(require 'cl-lib) ; `cl-loop' and `cl-pushnew'; unprefixed `loop' needs the obsolete `cl'
(cl-loop
 for node in '("n23" "n24" "n25" "n26" "n27" "n28" "n29")
 append
 (with-temp-buffer
   (insert (shell-command-to-string (format "pbsnodes %s" node)))
   (goto-char (point-min))
   (let ((found-jobs '()))
     ;; The pattern expects NUMBER/JOBID.  Dots are escaped and at least
     ;; one digit is required so that only literal job ids match.
     (while (re-search-forward
             "[0-9]+/\\([0-9]+\\.gilgamesh\\.cheme\\.cmu\\.edu\\)" nil t)
       ;; `cl-pushnew' works on a `let'-bound variable under lexical
       ;; binding, which `add-to-list' with a quoted symbol does not.
       (cl-pushnew (match-string 1) found-jobs :test #'equal))
     (cl-loop
      for job in found-jobs
      ;; The third line of the qstat output is used as the job's row
      ;; (the code assumes the first two lines are headers).
      for status = (nth 2 (split-string
                           (shell-command-to-string (format "qstat %s" job))
                           "\n"))
      ;; Skip a job whose qstat output has no third line rather than
      ;; signalling an error from `split-string' on nil.
      when status
      collect (cons node (split-string status))))))
;; For each listed node, run `pbsnodes NODE', collect the distinct job ids
;; mentioned in its output, and return one row per job: the node name
;; followed by the whitespace-split fields of a line of `qstat JOB' output.
(require 'cl-lib) ; `cl-loop' and `cl-pushnew'; unprefixed `loop' needs the obsolete `cl'
(cl-loop
 for node in '("n7" "n15" "n16" "n18" "n19" "n22" "n23")
 append
 (with-temp-buffer
   (insert (shell-command-to-string (format "pbsnodes %s" node)))
   (goto-char (point-min))
   (let ((found-jobs '()))
     ;; The pattern expects NUMBER/JOBID.  Dots are escaped and at least
     ;; one digit is required so that only literal job ids match.
     (while (re-search-forward
             "[0-9]+/\\([0-9]+\\.gilgamesh\\.cheme\\.cmu\\.edu\\)" nil t)
       ;; `cl-pushnew' works on a `let'-bound variable under lexical
       ;; binding, which `add-to-list' with a quoted symbol does not.
       (cl-pushnew (match-string 1) found-jobs :test #'equal))
     (cl-loop
      for job in found-jobs
      ;; The third line of the qstat output is used as the job's row
      ;; (the code assumes the first two lines are headers).
      for status = (nth 2 (split-string
                           (shell-command-to-string (format "qstat %s" job))
                           "\n"))
      ;; Skip a job whose qstat output has no third line rather than
      ;; signalling an error from `split-string' on nil.
      when status
      collect (cons node (split-string status))))))