Hi there,
Im trying to do some ancestry stuff after getting vcf from a nextflow pipeline.
I used conda kernel to run jupyternotebook on hpc vscode.
import os
os.environ["JAVA_HOME"] = "/home1/xiesheng/jdk-21.0.6"
os.environ["SPARK_HOME"] = "/home1/xiesheng/spark-3.5.5-bin-hadoop3"
from pyspark.sql import SparkSession
import findspark
findspark.init()
from pyspark.sql import SparkSession
import hail as hl
HAIL_DIR = os.path.dirname(hl.__file__)
HAIL_JAR = os.path.join(HAIL_DIR, 'backend', 'hail-all-spark.jar')
spark = SparkSession.builder \
.master("local[*]") \
.appName("hail_session") \
.config("spark.jars", HAIL_JAR) \
.config("spark.driver.extraClassPath", HAIL_JAR) \
.config("spark.executor.extraClassPath", "./hail-all-spark.jar") \
.getOrCreate()
hl.init(sc=spark.sparkContext, idempotent=True, quiet=True, skip_logging_configuration=True)
Got error:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
Cell In[10], line 14
4 HAIL_JAR = os.path.join(HAIL_DIR, 'backend', 'hail-all-spark.jar')
6 spark = SparkSession.builder \
7 .master("local[*]") \
8 .appName("hail_session") \
(...)
11 .config("spark.executor.extraClassPath", "./hail-all-spark.jar") \
12 .getOrCreate()
---> 14 hl.init(sc=spark.sparkContext, idempotent=True, quiet=True, skip_logging_configuration=True)
File <decorator-gen-1786>:2, in init(sc, app_name, master, local, log, quiet, append, min_block_size, branching_factor, tmp_dir, default_reference, idempotent, global_seed, spark_conf, skip_logging_configuration, local_tmpdir, _optimizer_iterations, backend, driver_cores, driver_memory, worker_cores, worker_memory, gcs_requester_pays_configuration, regions)
File ~/.local/lib/python3.10/site-packages/hail/typecheck/check.py:584, in _make_dec.<locals>.wrapper(__original_func, *args, **kwargs)
581 @decorator
582 def wrapper(__original_func, *args, **kwargs):
583 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 584 return __original_func(*args_, **kwargs_)
File ~/.local/lib/python3.10/site-packages/hail/context.py:349, in init(sc, app_name, master, local, log, quiet, append, min_block_size, branching_factor, tmp_dir, default_reference, idempotent, global_seed, spark_conf, skip_logging_configuration, local_tmpdir, _optimizer_iterations, backend, driver_cores, driver_memory, worker_cores, worker_memory, gcs_requester_pays_configuration, regions)
332 return asyncio.get_event_loop().run_until_complete(init_batch(
333 log=log,
334 quiet=quiet,
(...)
346 regions=regions
347 ))
348 if backend == 'spark':
--> 349 return init_spark(
350 sc=sc,
351 app_name=app_name,
352 master=master,
353 local=local,
354 min_block_size=min_block_size,
355 branching_factor=branching_factor,
356 spark_conf=spark_conf,
357 _optimizer_iterations=_optimizer_iterations,
358 log=log,
359 quiet=quiet,
360 append=append,
361 tmp_dir=tmp_dir,
362 local_tmpdir=local_tmpdir,
363 default_reference=default_reference,
364 global_seed=global_seed,
365 skip_logging_configuration=skip_logging_configuration,
366 gcs_requester_pays_configuration=gcs_requester_pays_configuration
367 )
368 if backend == 'local':
369 return init_local(
370 log=log,
371 quiet=quiet,
(...)
377 gcs_requester_pays_configuration=gcs_requester_pays_configuration
378 )
File <decorator-gen-1788>:2, in init_spark(sc, app_name, master, local, log, quiet, append, min_block_size, branching_factor, tmp_dir, default_reference, idempotent, global_seed, spark_conf, skip_logging_configuration, local_tmpdir, _optimizer_iterations, gcs_requester_pays_configuration)
File ~/.local/lib/python3.10/site-packages/hail/typecheck/check.py:584, in _make_dec.<locals>.wrapper(__original_func, *args, **kwargs)
581 @decorator
582 def wrapper(__original_func, *args, **kwargs):
583 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 584 return __original_func(*args_, **kwargs_)
File ~/.local/lib/python3.10/site-packages/hail/context.py:428, in init_spark(sc, app_name, master, local, log, quiet, append, min_block_size, branching_factor, tmp_dir, default_reference, idempotent, global_seed, spark_conf, skip_logging_configuration, local_tmpdir, _optimizer_iterations, gcs_requester_pays_configuration)
426 app_name = app_name or 'Hail'
427 gcs_requester_pays_project, gcs_requester_pays_buckets = convert_gcs_requester_pays_configuration_to_hadoop_conf_style(gcs_requester_pays_configuration)
--> 428 backend = SparkBackend(
429 idempotent, sc, spark_conf, app_name, master, local, log,
430 quiet, append, min_block_size, branching_factor, tmpdir, local_tmpdir,
431 skip_logging_configuration, optimizer_iterations,
432 gcs_requester_pays_project=gcs_requester_pays_project,
433 gcs_requester_pays_buckets=gcs_requester_pays_buckets
434 )
435 if not backend.fs.exists(tmpdir):
436 backend.fs.mkdir(tmpdir)
File ~/.local/lib/python3.10/site-packages/hail/backend/spark_backend.py:208, in SparkBackend.__init__(self, idempotent, sc, spark_conf, app_name, master, local, log, quiet, append, min_block_size, branching_factor, tmpdir, local_tmpdir, skip_logging_configuration, optimizer_iterations, gcs_requester_pays_project, gcs_requester_pays_buckets)
205 self._jhc = hail_package.HailContext.getOrCreate(
206 self._jbackend, branching_factor, optimizer_iterations)
207 else:
--> 208 self._jbackend = hail_package.backend.spark.SparkBackend.apply(
209 jsc, app_name, master, local, log, True, append, skip_logging_configuration, min_block_size, tmpdir, local_tmpdir,
210 gcs_requester_pays_project, gcs_requester_pays_buckets)
211 self._jhc = hail_package.HailContext.apply(
212 self._jbackend, branching_factor, optimizer_iterations)
214 self._jsc = self._jbackend.sc()
File ~/.local/lib/python3.10/site-packages/py4j/java_gateway.py:1321, in JavaMember.__call__(self, *args)
1315 command = proto.CALL_COMMAND_NAME +\
1316 self.command_header +\
1317 args_command +\
1318 proto.END_COMMAND_PART
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
1325 temp_arg._detach()
File ~/.local/lib/python3.10/site-packages/pyspark/sql/utils.py:190, in capture_sql_exception.<locals>.deco(*a, **kw)
188 def deco(*a: Any, **kw: Any) -> Any:
189 try:
--> 190 return f(*a, **kw)
191 except Py4JJavaError as e:
192 converted = convert_exception(e.java_exception)
File ~/.local/lib/python3.10/site-packages/py4j/protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.\n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:\n{3}\n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling z:is.hail.backend.spark.SparkBackend.apply.
: is.hail.utils.HailException: Found problems with SparkContext configuration:
Invalid configuration property spark.serializer: required org.apache.spark.serializer.KryoSerializer. Found: empty parameter.
Invalid config parameter: spark.kryo.registrator must include is.hail.kryo.HailKryoRegistrator.Found empty parameter.
at is.hail.utils.ErrorHandling.fatal(ErrorHandling.scala:17)
at is.hail.utils.ErrorHandling.fatal$(ErrorHandling.scala:17)
at is.hail.utils.package$.fatal(package.scala:78)
at is.hail.backend.spark.SparkBackend$.checkSparkConfiguration(SparkBackend.scala:169)
at is.hail.backend.spark.SparkBackend$.apply(SparkBackend.scala:244)
at is.hail.backend.spark.SparkBackend.apply(SparkBackend.scala)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:834)
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[5], line 7
2 from gnomad.sample_qc.ancestry import (
3 apply_onnx_classification_model,
4 assign_population_pcs,
5 )
6 from gnomad.utils.filtering import filter_to_adj
----> 7 from gnomad_qc.v2.resources.basics import get_gnomad_meta
8 from gnomad_qc.v4.resources.basics import get_checkpoint_path
ModuleNotFoundError: No module named 'gnomad_qc'
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
Cell In[7], line 3
1 import onnx
2 import hail as hl
----> 3 hl.init()
File <decorator-gen-1786>:2, in init(sc, app_name, master, local, log, quiet, append, min_block_size, branching_factor, tmp_dir, default_reference, idempotent, global_seed, spark_conf, skip_logging_configuration, local_tmpdir, _optimizer_iterations, backend, driver_cores, driver_memory, worker_cores, worker_memory, gcs_requester_pays_configuration, regions)
File ~/.local/lib/python3.10/site-packages/hail/typecheck/check.py:584, in _make_dec.<locals>.wrapper(__original_func, *args, **kwargs)
581 @decorator
582 def wrapper(__original_func, *args, **kwargs):
583 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 584 return __original_func(*args_, **kwargs_)
File ~/.local/lib/python3.10/site-packages/hail/context.py:349, in init(sc, app_name, master, local, log, quiet, append, min_block_size, branching_factor, tmp_dir, default_reference, idempotent, global_seed, spark_conf, skip_logging_configuration, local_tmpdir, _optimizer_iterations, backend, driver_cores, driver_memory, worker_cores, worker_memory, gcs_requester_pays_configuration, regions)
332 return asyncio.get_event_loop().run_until_complete(init_batch(
333 log=log,
334 quiet=quiet,
(...)
346 regions=regions
347 ))
348 if backend == 'spark':
--> 349 return init_spark(
350 sc=sc,
...
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:834)
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...
After digging around for some days I still don’t have much idea what’s going on.
Any help is greatly appreciated!