[SOLVED] Multiple users on an HPC cluster: possible concurrency/threading problem

NOTE: it seems a recent update to Hail has solved this problem. If anyone wants to comment, or can explain how to delete this post, please do so :slight_smile:

------ START OUTDATED POST ------
I’ve been trying to get Hail working in an HPC environment, with multiple users running Hail at the same time on a shared filesystem. My design uses a central code and library repository containing a $CODE_HOME/hail/ installation and a $CODE_HOME/miniconda/ Python installation, which every user’s PATH points to. This worked fine for both interactive and spark-submit use with a single user.

Today, however, when I tested with multiple users, the call hc = HailContext() would fail intermittently with one of two errors. Each user was ssh’ed into a different node, and we were all using different Jupyter notebooks simultaneously. There were five of us, and every time we all tried to start a HailContext, at least one of us would fail with these errors; most of the time all five of us failed. Note that concurrent plain-Python sessions were fine, and from hail import * also worked without issue.
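For reference, the repro is minimal: each of us, in our own notebook, ran nothing more than

```python
# Each user, in a separate Jupyter notebook on a separate node,
# against the shared installations described above:
from hail import *   # always succeeded, even with all five of us running it

hc = HailContext()   # failed intermittently with one of the two errors below
```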

Any help at all would be wonderful; we would really like to work collaboratively on the cluster at the same time, all referencing the same Hail and Python installations, so we can keep our code synchronized.

The first error we would get was:

-------------------------------------------------------------------------
OSError                                   Traceback (most recent call last)
<ipython-input-11-2841f1963bb0> in <module>()
----> 1 hc_rav = HailContext()

/scratch/PI/dpwall/computeEnvironments/hail/python/hail/context.pyc in __init__(self, sc, appName, master, local, log, quiet, append, parquet_compression, min_block_size, branching_factor, tmp_dir)
    45
    46         from pyspark import SparkContext
---> 47         SparkContext._ensure_initialized()
    48
    49         self._gateway = SparkContext._gateway

/share/sw/free/spark.2.1.0/spark-2.1.0-bin-hadoop2.7/python/pyspark/context.py in _ensure_initialized(cls, instance, gateway, conf)
   254         with SparkContext._lock:
   255             if not SparkContext._gateway:
--> 256                 SparkContext._gateway = gateway or launch_gateway(conf)
   257                 SparkContext._jvm = SparkContext._gateway.jvm
   258

/share/sw/free/spark.2.1.0/spark-2.1.0-bin-hadoop2.7/python/pyspark/java_gateway.py in launch_gateway(conf)
    75             def preexec_func():
    76                 signal.signal(signal.SIGINT, signal.SIG_IGN)
---> 77             proc = Popen(command, stdin=PIPE, preexec_fn=preexec_func, env=env)
    78         else:
    79             # preexec_fn not supported on Windows

/scratch/PI/dpwall/computeEnvironments/miniconda2/lib/python2.7/subprocess.pyc in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags)
   388                                 p2cread, p2cwrite,
   389                                 c2pread, c2pwrite,
--> 390                                 errread, errwrite)
   391         except Exception:
   392             # Preserve original exception in case os.close raises.

/scratch/PI/dpwall/computeEnvironments/miniconda2/lib/python2.7/subprocess.pyc in _execute_child(self, args, executable, preexec_fn, close_fds, cwd, env, universal_newlines, startupinfo, creationflags, shell, to_close, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite)
  1022                         raise
  1023                 child_exception = pickle.loads(data)
-> 1024                 raise child_exception
  1025
  1026

OSError: [Errno 2] No such file or directory
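My (unverified) guess for this first error is that the Popen call inside pyspark’s launch_gateway is failing to locate the spark-submit launcher it assembles from SPARK_HOME. A minimal sanity check along those lines, assuming that’s actually the cause:

```python
import os

# launch_gateway() builds its Popen command from SPARK_HOME, so check
# that the variable is set and the launcher script is reachable from
# inside the notebook's environment on each node.
spark_home = os.environ.get('SPARK_HOME')
print('SPARK_HOME =', spark_home)
if spark_home:
    launcher = os.path.join(spark_home, 'bin', 'spark-submit')
    print('spark-submit exists:', os.path.exists(launcher))
```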

and the second error was:

-------------------------------------------------------------------------
Py4JJavaError                           Traceback (most recent call last)
<ipython-input-6-93fa734a63bb> in <module>()
----> 1 hc_nate = HailContext()

/scratch/PI/dpwall/computeEnvironments/hail/python/hail/context.pyc in __init__(self, sc, appName, master, local, log, quiet, append, parquet_compression, min_block_size, branching_factor, tmp_dir)
     60         self._jhc = scala_object(self._hail, 'HailContext').apply(
     61             jsc, appName, joption(master), local, log, quiet, append,
---> 62             parquet_compression, min_block_size, branching_factor, tmp_dir)
     63 
     64         self._jsc = self._jhc.sc()

/share/sw/free/spark.2.1.0/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py in __call__(self, *args)
   1131         answer = self.gateway_client.send_command(command)
   1132         return_value = get_return_value(
-> 1133             answer, self.gateway_client, self.target_id, self.name)
   1134 
   1135         for temp_arg in temp_args:

/share/sw/free/spark.2.1.0/spark-2.1.0-bin-hadoop2.7/python/pyspark/sql/utils.py in deco(*a, **kw)
     61     def deco(*a, **kw):
     62         try:
---> 63             return f(*a, **kw)
     64         except py4j.protocol.Py4JJavaError as e:
     65             s = e.java_exception.toString()

/share/sw/free/spark.2.1.0/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
    317                 raise Py4JJavaError(
    318                     "An error occurred while calling {0}{1}{2}.\n".
--> 319                     format(target_id, ".", name), value)
    320             else:
    321                 raise Py4JError(

Py4JJavaError: An error occurred while calling o68.apply.
: org.apache.spark.SparkException: Only one SparkContext may be running in this JVM (see SPARK-2243). To ignore this error, set spark.driver.allowMultipleContexts = true. The currently running SparkContext was created at:
org.apache.spark.SparkContext.<init>(SparkContext.scala:76)
is.hail.HailContext$.configureAndCreateSparkContext(HailContext.scala:84)
is.hail.HailContext$.apply(HailContext.scala:164)
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
java.lang.reflect.Method.invoke(Method.java:498)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:280)
py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
py4j.commands.CallCommand.execute(CallCommand.java:79)
py4j.GatewayConnection.run(GatewayConnection.java:214)
java.lang.Thread.run(Thread.java:745)
	at org.apache.spark.SparkContext$$anonfun$assertNoOtherContextIsRunning$2.apply(SparkContext.scala:2278)
	at org.apache.spark.SparkContext$$anonfun$assertNoOtherContextIsRunning$2.apply(SparkContext.scala:2274)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.SparkContext$.assertNoOtherContextIsRunning(SparkContext.scala:2274)
	at org.apache.spark.SparkContext$.markPartiallyConstructed(SparkContext.scala:2353)
	at org.apache.spark.SparkContext.<init>(SparkContext.scala:85)
	at is.hail.HailContext$.configureAndCreateSparkContext(HailContext.scala:84)
	at is.hail.HailContext$.apply(HailContext.scala:164)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
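For this second error, the SPARK-2243 message suggests our separate notebooks are somehow ending up attached to the same JVM, which might also explain why the failures are intermittent. Since the HailContext signature in the traceback above takes an sc argument, one thing we could try is building the SparkContext ourselves and handing it in. This is only a sketch; I haven’t verified that Hail accepts a context configured this way:

```python
from pyspark import SparkContext
from hail import HailContext

# getOrCreate() reuses whatever SparkContext already exists in this JVM
# rather than constructing a second one (which is exactly what the
# SPARK-2243 error above forbids).
sc = SparkContext.getOrCreate()
hc = HailContext(sc)
```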

Thank you so much!!