Thank you for the reply. The error happens when I call `hl.vds.filter_intervals` to filter variants. I have a PRS with 40,000 SNPs, and `locus_list` is quite long because the SNPs are sparsely scattered across the genome. This is how I generate the locus list:
test_intervals = [f"chr{row.hm_chr}:{int(row.hm_pos)}-chr{row.hm_chr}:{int(row.hm_pos)+1}"
for idx, row in tqdm(prs.iterrows(), total= len(prs))]
The complete traceback is this:
LineTooLong Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py:703, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
702 # Make the request on the httplib connection object.
--> 703 httplib_response = self._make_request(
704 conn,
705 method,
706 url,
707 timeout=timeout_obj,
708 body=body,
709 headers=headers,
710 chunked=chunked,
711 )
713 # If we're going to release the connection in ``finally:``, then
714 # the response doesn't need to know about the connection. Otherwise
715 # it will also try to release it and we'll have a double-release
716 # mess.
File /opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py:449, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
445 except BaseException as e:
446 # Remove the TypeError from the exception chain in
447 # Python 3 (including for exceptions like SystemExit).
448 # Otherwise it looks like a bug in the code.
--> 449 six.raise_from(e, None)
450 except (SocketTimeout, BaseSSLError, SocketError) as e:
File <string>:3, in raise_from(value, from_value)
File /opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py:444, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
443 try:
--> 444 httplib_response = conn.getresponse()
445 except BaseException as e:
446 # Remove the TypeError from the exception chain in
447 # Python 3 (including for exceptions like SystemExit).
448 # Otherwise it looks like a bug in the code.
File /opt/conda/lib/python3.10/http/client.py:1375, in HTTPConnection.getresponse(self)
1374 try:
-> 1375 response.begin()
1376 except ConnectionError:
File /opt/conda/lib/python3.10/http/client.py:337, in HTTPResponse.begin(self)
335 raise UnknownProtocol(version)
--> 337 self.headers = self.msg = parse_headers(self.fp)
339 if self.debuglevel > 0:
File /opt/conda/lib/python3.10/http/client.py:234, in parse_headers(fp, _class)
225 """Parses only RFC2822 headers from a file pointer.
226
227 email Parser wants to see strings rather than bytes.
(...)
232
233 """
--> 234 headers = _read_headers(fp)
235 hstring = b''.join(headers).decode('iso-8859-1')
File /opt/conda/lib/python3.10/http/client.py:216, in _read_headers(fp)
215 if len(line) > _MAXLINE:
--> 216 raise LineTooLong("header line")
217 headers.append(line)
LineTooLong: got more than 1048576 bytes when reading header line
During handling of the above exception, another exception occurred:
ProtocolError Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/requests/adapters.py:667, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
666 try:
--> 667 resp = conn.urlopen(
668 method=request.method,
669 url=url,
670 body=request.body,
671 headers=request.headers,
672 redirect=False,
673 assert_same_host=False,
674 preload_content=False,
675 decode_content=False,
676 retries=self.max_retries,
677 timeout=timeout,
678 chunked=chunked,
679 )
681 except (ProtocolError, OSError) as err:
File /opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py:787, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
785 e = ProtocolError("Connection aborted.", e)
--> 787 retries = retries.increment(
788 method, url, error=e, _pool=self, _stacktrace=sys.exc_info()[2]
789 )
790 retries.sleep()
File /opt/conda/lib/python3.10/site-packages/urllib3/util/retry.py:550, in Retry.increment(self, method, url, response, error, _pool, _stacktrace)
549 if read is False or not self._is_method_retryable(method):
--> 550 raise six.reraise(type(error), error, _stacktrace)
551 elif read is not None:
File /opt/conda/lib/python3.10/site-packages/urllib3/packages/six.py:769, in reraise(tp, value, tb)
768 if value.__traceback__ is not tb:
--> 769 raise value.with_traceback(tb)
770 raise value
File /opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py:703, in HTTPConnectionPool.urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
702 # Make the request on the httplib connection object.
--> 703 httplib_response = self._make_request(
704 conn,
705 method,
706 url,
707 timeout=timeout_obj,
708 body=body,
709 headers=headers,
710 chunked=chunked,
711 )
713 # If we're going to release the connection in ``finally:``, then
714 # the response doesn't need to know about the connection. Otherwise
715 # it will also try to release it and we'll have a double-release
716 # mess.
File /opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py:449, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
445 except BaseException as e:
446 # Remove the TypeError from the exception chain in
447 # Python 3 (including for exceptions like SystemExit).
448 # Otherwise it looks like a bug in the code.
--> 449 six.raise_from(e, None)
450 except (SocketTimeout, BaseSSLError, SocketError) as e:
File <string>:3, in raise_from(value, from_value)
File /opt/conda/lib/python3.10/site-packages/urllib3/connectionpool.py:444, in HTTPConnectionPool._make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
443 try:
--> 444 httplib_response = conn.getresponse()
445 except BaseException as e:
446 # Remove the TypeError from the exception chain in
447 # Python 3 (including for exceptions like SystemExit).
448 # Otherwise it looks like a bug in the code.
File /opt/conda/lib/python3.10/http/client.py:1375, in HTTPConnection.getresponse(self)
1374 try:
-> 1375 response.begin()
1376 except ConnectionError:
File /opt/conda/lib/python3.10/http/client.py:337, in HTTPResponse.begin(self)
335 raise UnknownProtocol(version)
--> 337 self.headers = self.msg = parse_headers(self.fp)
339 if self.debuglevel > 0:
File /opt/conda/lib/python3.10/http/client.py:234, in parse_headers(fp, _class)
225 """Parses only RFC2822 headers from a file pointer.
226
227 email Parser wants to see strings rather than bytes.
(...)
232
233 """
--> 234 headers = _read_headers(fp)
235 hstring = b''.join(headers).decode('iso-8859-1')
File /opt/conda/lib/python3.10/http/client.py:216, in _read_headers(fp)
215 if len(line) > _MAXLINE:
--> 216 raise LineTooLong("header line")
217 headers.append(line)
ProtocolError: ('Connection aborted.', LineTooLong('got more than 1048576 bytes when reading header line'))
During handling of the above exception, another exception occurred:
ConnectionError Traceback (most recent call last)
Cell In[12], line 2
1 print("filtering variants")
----> 2 vds = hl.vds.filter_intervals(vds, locus_list[:5000])
3 print("variants after filter:", vds.variant_data.count())
4 print("filtering samples")
File <decorator-gen-1870>:2, in filter_intervals(vds, intervals, split_reference_blocks, keep)
File /opt/conda/lib/python3.10/site-packages/hail/typecheck/check.py:585, in _make_dec.<locals>.wrapper(__original_func, *args, **kwargs)
582 @decorator
583 def wrapper(__original_func: Callable[..., T], *args, **kwargs) -> T:
584 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 585 return __original_func(*args_, **kwargs_)
File /opt/conda/lib/python3.10/site-packages/hail/vds/methods.py:623, in filter_intervals(vds, intervals, split_reference_blocks, keep)
621 if split_reference_blocks and not keep:
622 raise ValueError("'filter_intervals': cannot use 'split_reference_blocks' with keep=False")
--> 623 return _parameterized_filter_intervals(
624 vds, intervals, keep=keep, mode='split_at_boundaries' if split_reference_blocks else 'variants_only'
625 )
File <decorator-gen-1866>:2, in _parameterized_filter_intervals(vds, intervals, keep, mode)
File /opt/conda/lib/python3.10/site-packages/hail/typecheck/check.py:585, in _make_dec.<locals>.wrapper(__original_func, *args, **kwargs)
582 @decorator
583 def wrapper(__original_func: Callable[..., T], *args, **kwargs) -> T:
584 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 585 return __original_func(*args_, **kwargs_)
File /opt/conda/lib/python3.10/site-packages/hail/vds/methods.py:495, in _parameterized_filter_intervals(vds, intervals, keep, mode)
489 max_len = hl.eval(vds.reference_data.index_globals()[rbml])
490 ref_intervals = intervals.map(
491 lambda interval: hl.interval(
492 interval.start - (max_len - 1), interval.end, interval.includes_start, interval.includes_end
493 )
494 )
--> 495 reference_data = hl.filter_intervals(reference_data, ref_intervals, keep)
496 else:
497 warning(
498 "'hl.vds.filter_intervals': filtering intervals without a known max reference block length"
499 "\n (computed by `hl.vds.store_ref_block_max_length` or 'hl.vds.truncate_reference_blocks')"
500 "\n requires a full pass over the reference data (expensive!)"
501 )
File <decorator-gen-1632>:2, in filter_intervals(ds, intervals, keep)
File /opt/conda/lib/python3.10/site-packages/hail/typecheck/check.py:585, in _make_dec.<locals>.wrapper(__original_func, *args, **kwargs)
582 @decorator
583 def wrapper(__original_func: Callable[..., T], *args, **kwargs) -> T:
584 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 585 return __original_func(*args_, **kwargs_)
File /opt/conda/lib/python3.10/site-packages/hail/methods/misc.py:415, in filter_intervals(ds, intervals, keep)
412 else:
413 return interval
--> 415 intervals = hl.eval(intervals)
416 intervals = [wrap_input(i) for i in intervals]
418 if isinstance(ds, MatrixTable):
File <decorator-gen-570>:2, in eval(expression)
File /opt/conda/lib/python3.10/site-packages/hail/typecheck/check.py:585, in _make_dec.<locals>.wrapper(__original_func, *args, **kwargs)
582 @decorator
583 def wrapper(__original_func: Callable[..., T], *args, **kwargs) -> T:
584 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 585 return __original_func(*args_, **kwargs_)
File /opt/conda/lib/python3.10/site-packages/hail/expr/expressions/expression_utils.py:194, in eval(expression)
167 @typecheck(expression=expr_any)
168 def eval(expression):
169 """Evaluate a Hail expression, returning the result.
170
171 This method is extremely useful for learning about Hail expressions and
(...)
192 Any
193 """
--> 194 return eval_timed(expression)[0]
File <decorator-gen-568>:2, in eval_timed(expression)
File /opt/conda/lib/python3.10/site-packages/hail/typecheck/check.py:585, in _make_dec.<locals>.wrapper(__original_func, *args, **kwargs)
582 @decorator
583 def wrapper(__original_func: Callable[..., T], *args, **kwargs) -> T:
584 args_, kwargs_ = check_all(__original_func, args, kwargs, checkers, is_method=is_method)
--> 585 return __original_func(*args_, **kwargs_)
File /opt/conda/lib/python3.10/site-packages/hail/expr/expressions/expression_utils.py:164, in eval_timed(expression)
161 uid = Env.get_uid()
162 ir = expression._indices.source.select_globals(**{uid: expression}).index_globals()[uid]._ir
--> 164 return Env.backend().execute(MakeTuple([ir]), timed=True)[0]
File /opt/conda/lib/python3.10/site-packages/hail/backend/spark_backend.py:226, in SparkBackend.execute(self, ir, timed)
223 except Exception as fatal:
224 raise err from fatal
--> 226 raise err
File /opt/conda/lib/python3.10/site-packages/hail/backend/spark_backend.py:218, in SparkBackend.execute(self, ir, timed)
216 def execute(self, ir: BaseIR, timed: bool = False) -> Any:
217 try:
--> 218 return super().execute(ir, timed)
219 except Exception as err:
220 if self._copy_log_on_error:
File /opt/conda/lib/python3.10/site-packages/hail/backend/backend.py:188, in Backend.execute(self, ir, timed)
186 payload = ExecutePayload(self._render_ir(ir), '{"name":"StreamBufferSpec"}', timed)
187 try:
--> 188 result, timings = self._rpc(ActionTag.EXECUTE, payload)
189 except FatalError as e:
190 raise e.maybe_user_error(ir) from None
File /opt/conda/lib/python3.10/site-packages/hail/backend/py4j_backend.py:218, in Py4JBackend._rpc(self, action, payload)
216 path = action_routes[action]
217 port = self._backend_server_port
--> 218 resp = self._requests_session.post(f'http://localhost:{port}{path}', data=data)
219 if resp.status_code >= 400:
220 error_json = orjson.loads(resp.content)
File /opt/conda/lib/python3.10/site-packages/requests/sessions.py:637, in Session.post(self, url, data, json, **kwargs)
626 def post(self, url, data=None, json=None, **kwargs):
627 r"""Sends a POST request. Returns :class:`Response` object.
628
629 :param url: URL for the new :class:`Request` object.
(...)
634 :rtype: requests.Response
635 """
--> 637 return self.request("POST", url, data=data, json=json, **kwargs)
File /opt/conda/lib/python3.10/site-packages/requests/sessions.py:589, in Session.request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
584 send_kwargs = {
585 "timeout": timeout,
586 "allow_redirects": allow_redirects,
587 }
588 send_kwargs.update(settings)
--> 589 resp = self.send(prep, **send_kwargs)
591 return resp
File /opt/conda/lib/python3.10/site-packages/requests/sessions.py:703, in Session.send(self, request, **kwargs)
700 start = preferred_clock()
702 # Send the request
--> 703 r = adapter.send(request, **kwargs)
705 # Total elapsed time of the request (approximately)
706 elapsed = preferred_clock() - start
File /opt/conda/lib/python3.10/site-packages/requests/adapters.py:682, in HTTPAdapter.send(self, request, stream, timeout, verify, cert, proxies)
667 resp = conn.urlopen(
668 method=request.method,
669 url=url,
(...)
678 chunked=chunked,
679 )
681 except (ProtocolError, OSError) as err:
--> 682 raise ConnectionError(err, request=request)
684 except MaxRetryError as e:
685 if isinstance(e.reason, ConnectTimeoutError):
686 # TODO: Remove this in 3.0.0: see #2811
ConnectionError: ('Connection aborted.', LineTooLong('got more than 1048576 bytes when reading header line'))