Skip to content
Snippets Groups Projects
Commit fcef1381 authored by ye.lu's avatar ye.lu
Browse files

update for dev

parent 71b9daa2
No related branches found
No related tags found
No related merge requests found
import sys, math, os, re, shutil, time
from subprocess import Popen, run
from subprocess import Popen, run, PIPE
class MPIController:
def __init__(self, numCores=1 , numMembers=1 , strMember=".mem",\
......@@ -103,7 +103,7 @@ class MPIController:
""" Two time format: passtime or exacttime
"""
time_now = time.gmtime()
self.CheckTime = time.time_ns() * 10 ** -9
self.CheckTime = time.time()
self.PassTime = self.CheckTime - self.StartTime
str_time_now = "{0:04d}-{1:02d}-{2:02d}_{3:02d}:{4:02d}:{5:02d}".format( \
time_now. tm_year , time_now. tm_mon , time_now. tm_mday ,\
......@@ -161,12 +161,14 @@ class MPIController:
class SlurmController:
def __init__(self, strProject="", strPartition="", numCoresPerNode=0, strMember=".mem", strRunFolder="run", strJobname="", strRootdir=".", ifVerbose=False, strMachine=None, ifServerLog=False, strServerLog="EFL_ServerLog"):
def __init__(self, strProject="", strPartition="", numCoresPerNode=0, strMember=".mem", strRunFolder="run", strJobname="", strRootDir=".", ifVerbose=False, strMachine=None, ifServerLog=True , strServerLog="EFL_ServerLog"):
self.str_project = strProject
self.str_partition = strPartition
self.corespernode = numCoresPerNode
self.CoresPerNode = numCoresPerNode
self.strPreName = strMember
self.strRunDir = strRunFolder
self.strRootDir = os.path.abspath(strRootDir)
self.if_verbose = ifVerbose
self.dicRunMembers = {}
if strJobname != "":
......@@ -175,19 +177,21 @@ class SlurmController:
self.str_jobname = "EnsemPy"
self.str_outname = "{0:s}-out".format(self.str_jobname)
self.str_errname = "{0:s}-err".format(self.str_jobname)
self.strRootDir = strRootdir
if strMachine == "JUWELS":
self.corespernode = 48
if strMachine == "JURECA":
self.corespernode = 64
self.if_serverlog = ifServerLog
if ifServerLog:
self.strServerLog = "{0:s}/{1:s}".format(strRootdir, strServerLog)
self.strServerLog = "{0:s}/{1:s}".format(strRootDir, strServerLog)
fileLog = open(self.strServerLog, "w")
fileLog.close()
self.num_waiting = 5
self.num_ping_file = 5
self.StartTime = time.time_ns() * 10 ** -9
self.arr_hostnames = []
self.StartTime = time.time()
self.ifRun = True
self.numTimeSleep = 1.0
def ChangeServerLog(self, fileOutput):
self.fileServerLog = open(fileOutput, "w")
......@@ -198,28 +202,29 @@ class SlurmController:
else:
self.corespermember = UsingCores
self.nodespermember = UsingNodes
self.CoresPerMember = UsingNodes * self. corespernode
self.numMembers = members
self.arr_hostpermember = [ "" for m in range(members)]
try:
len(self.arr_hostnames)
except:
print("No Nodelist, obtaining automatically")
self.CheckNodelist()
self.CheckNodelist()
print(members, UsingNodes)
for m in range(members):
for n in range(UsingNodes):
for n in range(self.nodespermember):
print(self.arr_hostpermember)
if n == 0:
self.arr_hostpermember[m] = "{}".format(self.arr_hostnames[ m*UsingNodes + n ])
self.arr_hostpermember[m] = "{}".format(self.arr_hostnames[ m*self.nodespermember + n ])
elif n == UsingNodes-1:
strPreName = self.arr_hostpermember[m]
self.arr_hostpermember[m] = "{},{}".format(strPreName, self.arr_hostnames[ m*UsingNodes + n ])
self.arr_hostpermember[m] = "{},{}".format(strPreName, self.arr_hostnames[ m*self.nodespermember + n ])
else:
strPreName = self.arr_hostpermember[m]
self.arr_hostpermember[m] = "{},{}".format(strPreName, self.arr_hostnames[ m*UsingNodes + n ])
self.arr_hostpermember[m] = "{},{}".format(strPreName, self.arr_hostnames[ m*self.nodespermember + n ])
print(self.arr_hostpermember)
def CheckNodelist(self):
str_hostnames = subprocess.run(["scontrol", "show", "hostnames"], capture_output=True).stdout
try:
str_hostnames = run(["scontrol", "show", "hostnames"], capture_output=True).stdout
except:
str_hostnames = run(["scontrol", "show", "hostnames"], stdout = PIPE ).stdout
arr_hostnames_tmp = str_hostnames.split(b"\n")
self.arr_hostnames = []
for ind, item in enumerate(arr_hostnames_tmp):
......@@ -238,7 +243,7 @@ class SlurmController:
os.mkdir("{0:s}".format(strRunfolderOut))
os.chdir("{0:s}".format(strRunfolderOut))
for item in self.arr_runfolder_files:
subprocess.run(["ln","-s", "{0:s}/{1:s}/{2:s}".format(self.strRootDir, self.strRunDir, item), "."])
run(["ln","-s", "{0:s}/{1:s}/{2:s}".format(self.strRootDir, self.strRunDir, item), "."])
def CreateMembers(self, arrException=[], if_force=True, if_hardlink=False):
......@@ -289,6 +294,7 @@ class SlurmController:
""" To design the running childs depends on the topology of
the nodes and cores and members.
"""
arrMembers = []
if strExecutor == "":
print("FATAL ERROR: you did not specific the executor's name")
for ind in range(self.numMembers):
......@@ -296,23 +302,43 @@ class SlurmController:
os.chdir("{0:s}".format(strRunfolderOut))
self.ServerLog("Mem: {3:04d}, Nodes: {0:4d}, Cores: {1:4d}, Nodelist: {2:s}"\
.format(self.nodespermember, self.corespermember, self.arr_hostpermember[ind], ind))
Child = Popen(["srun", \
"-N", "{0:d}".format(self.nodespermember),\
"-n", "{0:d}".format(self.corespermember),\
"--verbose",\
"--nodelist={0:s}".format(self.arr_hostpermember[ind]),\
"--output={0:s}".format(self.str_outname),\
"--error={0:s}".format(self.str_outname),\
"--job-name={0:s}".format(self.str_jobname),\
"{0:s}".format(strExecutor),"&"], \
cmd_in = re.split("\s", strExecutor )
arrPopen = ["srun", \
"-N", "{0:d}".format(self.nodespermember),\
"-n", "{0:d}".format(self.corespermember),\
"--verbose",\
"--nodelist={0:s}".format(self.arr_hostpermember[ind]),\
"--output={0:s}".format(self.str_outname),\
"--error={0:s}".format(self.str_outname),\
"--job-name={0:s}".format(self.str_jobname)]
for cmd in cmd_in:
arrPopen.append(cmd)
arrPopen.append("&")
Child = Popen( arrPopen,
stdin=None, stdout=None, stderr=None, close_fds=True,\
start_new_session=True )
self.dicRunMembers["{}".format(Child.pid) : {"Child" : Child , \
"Signal": Child.poll(), }]
self.dicRunMembers["{}".format(Child.pid)] = {"Child" : Child , \
"Signal": Child.poll() }
arrMembers.append(Child)
os.chdir("{0:s}".format(self.strRootDir))
return arrMembers
def KeepAlive(self):
    """Block until every child process in ``self.dicRunMembers`` has exited.

    Each value of ``self.dicRunMembers`` is a dict whose ``"Child"`` entry
    exposes ``poll()``; a child counts as still running while ``poll()``
    returns ``None`` (any other value, including exit code 0, means it has
    finished).  The loop re-polls every ``self.numTimeSleep`` seconds and
    runs only while ``self.ifRun`` is true.  ``self.ifRun`` is left
    ``False`` once all children have exited.

    Returns:
        None.  Prints a notice when the wait is over.
    """
    while self.ifRun:
        # Poll every child on each pass (no short-circuit) so that each
        # finished process has its exit status collected.
        arr_signal = [member["Child"].poll()
                      for member in self.dicRunMembers.values()]
        # Identity test: an exit code of 0 is falsy but still means "done".
        self.ifRun = any(signal is None for signal in arr_signal)
        if self.ifRun:
            # Only wait when something is still running; no extra sleep
            # after the last child has already exited.
            time.sleep(self.numTimeSleep)
    print("All Child is dead")
def CheckMembersWRF(self, numWaitingTime=5 , ifExit=True, numWRFinitTime=10):
time.sleep(numWRFinitTime)
num_finished = 0
......@@ -376,7 +402,7 @@ class SlurmController:
"""
time_now = time.gmtime()
self.CheckTime = time.time_ns() * 10 ** -9
self.CheckTime = time.time()
self.PassTime = self.CheckTime - self.StartTime
str_time_now = "{0:04d}-{1:02d}-{2:02d}_{3:02d}:{4:02d}:{5:02d}".format( \
time_now. tm_year , time_now. tm_mon , time_now. tm_mday ,\
......
import sys, math, os, re, shutil, time
from subprocess import Popen, run
from subprocess import Popen, run, PIPE
class MPIController:
......@@ -221,7 +221,10 @@ class SlurmController:
self.arr_hostpermember[m] = "{},{}".format(strPreName, self.arr_hostnames[ m*UsingNodes + n ])
def CheckNodelist(self):
str_hostnames = subprocess.run(["scontrol", "show", "hostnames"], capture_output=True).stdout
try:
str_hostnames = subprocess.run(["scontrol", "show", "hostnames"], capture_output=True).stdout
except:
str_hostnames = subprocess.run(["scontrol", "show", "hostnames"], stdout = PIPE).stdout
arr_hostnames_tmp = str_hostnames.split(b"\n")
self.arr_hostnames = []
for ind, item in enumerate(arr_hostnames_tmp):
......
/Data/Work/CODES/PYTHON/LEFpy/test_ground/run/mpi_test.out
\ No newline at end of file
/p/project/exaww/lu3/APPS/les-py/test_ground/run/mpi_test.out
\ No newline at end of file
/Data/Work/CODES/PYTHON/LEFpy/test_ground/run/test.out
\ No newline at end of file
/p/project/exaww/lu3/APPS/les-py/test_ground/run/test.out
\ No newline at end of file
/Data/Work/CODES/PYTHON/LEFpy/test_ground/run/mpi_test.out
\ No newline at end of file
/p/project/exaww/lu3/APPS/les-py/test_ground/run/mpi_test.out
\ No newline at end of file
/Data/Work/CODES/PYTHON/LEFpy/test_ground/run/test.out
\ No newline at end of file
/p/project/exaww/lu3/APPS/les-py/test_ground/run/test.out
\ No newline at end of file
/Data/Work/CODES/PYTHON/LEFpy/test_ground/run/mpi_test.out
\ No newline at end of file
/p/project/exaww/lu3/APPS/les-py/test_ground/run/mpi_test.out
\ No newline at end of file
/Data/Work/CODES/PYTHON/LEFpy/test_ground/run/test.out
\ No newline at end of file
/p/project/exaww/lu3/APPS/les-py/test_ground/run/test.out
\ No newline at end of file
#!/usr/bin/python3
from LightweightEnsembleFramework import MPIController
from LightweightEnsembleFramework import SlurmController
MPIC = MPIController(numCores=6, numMembers=3, strRunDir="run")
MPIC = SlurmController(numCoresPerNode=10 , strRunFolder="run")
MPIC.InitEnsemble()
MPIC.CreateMembers(ifForce=True)
MPIC.RunMembers("mpi_test.out 2 0")
MPIC.InitEnsemble(members=2 , UsingNodes=2)
MPIC.corespermember = 20
MPIC.CreateMembers(if_force=True)
print("Done Creating Members")
MPIC.RunMembers("mpi_test_gcc.out 600 1")
MPIC.KeepAlive()
print("Information:".format(MPIC.numCoresPerMember, MPIC.numCores))
#print("Information:".format(MPIC.numCoresPerMember, MPIC.numCores))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment