diff --git a/src/tola/fasta/index.py b/src/tola/fasta/index.py index 3289e7a..d3aeef1 100644 --- a/src/tola/fasta/index.py +++ b/src/tola/fasta/index.py @@ -1,38 +1,150 @@ #!/usr/bin/env python3 import io +import logging import re import sys from pathlib import Path +from tola.assembly.assembly import Assembly +from tola.assembly.format import format_agp +from tola.assembly.fragment import Fragment +from tola.assembly.gap import Gap +from tola.assembly.parser import parse_agp +from tola.assembly.scaffold import Scaffold + + +class IndexUsageError(Exception): + """Unexpected usage of FastaIndex""" + + +class FastaIndex: + __slots__ = "fasta_file", "fai_file", "agp_file", "index", "assembly" + + def __init__(self, fasta_file: Path | str): + if not isinstance(fasta_file, Path): + fasta_file = Path(fasta_file) + if not fasta_file.exists(): + missing = str(fasta_file) + raise FileNotFoundError(missing) + self.fasta_file = fasta_file + self.fai_file = Path(str(fasta_file) + ".fai") + self.agp_file = Path(str(fasta_file) + ".agp") + self.index = None + self.assembly = None + + def auto_load(self): + if self.check_for_index_files(): + self.load_index() + self.load_assembly() + else: + self.run_indexing() + + def check_for_index_files(self): + """ + Check that the .agp and fai files exist and are newer than the FASTA + sequence file. + """ + fasta_mtime = self.fasta_file.stat().st_mtime + for idx_file in self.fai_file, self.agp_file: + if not idx_file.exists(): + return False + if not idx_file.stat().st_mtime > fasta_mtime: + logging.warning( + f"Index file '{idx_file}' is older than" + f" FASTA file '{self.fasta_file}'" + ) + return False + return True + + def load_index(self): + if self.index: + msg = "Index FAI already loaded" + raise IndexUsageError(msg) + + idx_dict = {} + with self.fai_file.open() as idx: + for line in idx: + name, length, file_offset, residues_per_line, max_line_length = ( + line.split() + ) + idx_dict[name] = FastaInfo( + length, + file_offset, + residues_per_line, + max_line_length, + ) + self.index = idx_dict + + def write_index(self): + idx_dict = self.index + if not idx_dict: + msg = "No index data to write to FAI file" + raise IndexUsageError(msg) + if self.fai_file.exists(): + logging.warning(f"Overwriting FAI index file '{self.fai_file}'") + with self.fai_file.open("w") as idx_fh: + for name, info in idx_dict.items(): + idx_fh.write(info.fai_row(name)) + + def load_assembly(self): + if self.assembly: + msg = "Assembly AGP already loaded" + raise IndexUsageError(msg) + self.assembly = parse_agp(self.agp_file.open(), self.fasta_file.name) + + def write_assembly(self): + asm = self.assembly + if not asm: + msg = "No assembly data to write to AGP file" + raise IndexUsageError(msg) + if self.agp_file.exists(): + logging.warning(f"Overwriting AGP assembly file '{self.agp_file}'") + with self.agp_file.open("w") as agp_fh: + format_agp(asm, agp_fh) + + def run_indexing(self): + idx_dict, assembly = index_fasta_file(self.fasta_file) + self.index = idx_dict + self.assembly = assembly + self.write_index() + self.write_assembly() + class FastaInfo: __slots__ = ( - "name", "length", "file_offset", "residues_per_line", "max_line_length", - "seq_regions", ) def __init__( self, - name, length, file_offset, residues_per_line, max_line_length, - seq_regions=None, ): - self.name = name - self.length = length - self.file_offset = file_offset - self.residues_per_line = residues_per_line - self.max_line_length = max_line_length - self.seq_regions = seq_regions - - def fai_row(self): + self.length = int(length) + self.file_offset = int(file_offset) + self.residues_per_line = int(residues_per_line) + self.max_line_length = int(max_line_length) + + def __eq__(self, othr): + for attr in self.__slots__: + if getattr(self, attr) != getattr(othr, attr): + return False + return True + + def __repr__(self): + return ( + "FastaInfo(" + + (", ".join(f"{attr}={getattr(self, attr)!r}" for attr in self.__slots__)) + + ")" + ) + + def fai_row(self, name): """Returns a row for a Fasta Index (.fai) file.""" numbers = "\t".join( str(x) @@ -43,7 +155,7 @@ def fai_row(self): self.max_line_length, ) ) - return f"{self.name}\t{numbers}\n" + return f"{name}\t{numbers}\n" def regions(self): s = io.StringIO() @@ -64,27 +176,47 @@ def index_fasta_file(file: Path, buffer_size: int = 10_000_000): line_end_bytes = None seq_buffer = io.BytesIO() - info = [] + idx_dict = {} + asm = Assembly( + file.name, + header=[f"Built from FASTA file '{file.absolute()}'"], + ) - def load_info(): + def store_info(): process_seq_buffer() if region_end: - seq_regions.append((region_start + 1, region_end)) - info.append( - FastaInfo( - name, - seq_length, - file_offset, - residues_per_line, - residues_per_line + line_end_bytes, - seq_regions, - ) + seq_regions.append((region_start, region_end)) + + if idx_dict.get(name): + msg = f"More than one sequence named '{name}' in FASTA file '{file}'" + raise ValueError(msg) + idx_dict[name] = FastaInfo( + seq_length, + file_offset, + residues_per_line, + residues_per_line + line_end_bytes, ) + scffld = Scaffold(name) + prev = (0, 0) + for region in seq_regions: + start, end = region + if start != prev[1]: + gap_length = start - prev[1] + scffld.add_row(Gap(gap_length, "scaffold")) + scffld.add_row(Fragment(name, start + 1, end, 1)) + prev = region + if rem := seq_length - prev[1]: + scffld.add_row(Gap(rem, "scaffold")) + + asm.add_scaffold(scffld) + def process_seq_buffer(): - nonlocal seq_length - nonlocal region_start - nonlocal region_end + # Outer scope variables which we "rebind" in this function. + # See https://peps.python.org/pep-3104/ for explanation. + nonlocal seq_length, region_start, region_end + + # Take the value from the sequence buffer and empty it seq_bytes = seq_buffer.getvalue() seq_buffer.seek(0) seq_buffer.truncate(0) @@ -97,13 +229,12 @@ def process_seq_buffer(): region_end = end else: if region_end: - seq_regions.append((region_start + 1, region_end)) + seq_regions.append((region_start, region_end)) region_start = start region_end = end seq_length += len(seq_bytes) - # Opening the file in bytes mode means that Windows ("\r\n") or UNIX # ("\n") line endings are preserved. It is also about 10% faster than # decoding to UTF-8. @@ -114,12 +245,12 @@ def process_seq_buffer(): # If this isn't the first sequence in the file, store the # accumulated data from the previous sequence. if name: - load_info() + store_info() # Get new name by splitting on whitespace beyond the first - # character and taking the first element of the array. This - # also allows space characters following the ">" character of - # the header. + # character and taking the first element of the array. + # (This also allows space characters following the ">" + # character of the header.) name = line[1:].split()[0].decode("utf8") if not name: msg = f"Failed to parse sequence name from line:\n{line}" @@ -150,15 +281,18 @@ def process_seq_buffer(): # Store info for the last sequence in the file if name: - load_info() + store_info() - return info + if idx_dict: + return idx_dict, asm + else: + msg = f"No data in FASTA file '{file.absolute()}'" if __name__ == "__main__": for file in sys.argv[1:]: - info = index_fasta_file(Path(file)) - for fst in info: + idx_dict, asm = index_fasta_file(Path(file)) + for name, info in idx_dict.items(): # sys.stdout.write("\n") - sys.stdout.write(fst.fai_row()) + sys.stdout.write(info.fai_row(name)) # sys.stdout.write(fst.regions()) diff --git a/tests/fasta/test.fa.agp b/tests/fasta/test.fa.agp new file mode 100644 index 0000000..0e6b159 --- /dev/null +++ b/tests/fasta/test.fa.agp @@ -0,0 +1,553 @@ +# Built from FASTA file '/Users/jgrg/git/agp-tpf-utils/tests/fasta/test.fa' +RAND-001 1 3 1 U 3 scaffold yes proximity_ligation +RAND-001 4 173 2 W RAND-001 4 173 + +RAND-001 174 176 3 U 3 scaffold yes proximity_ligation +RAND-002 1 73 1 W RAND-002 1 73 + +RAND-002 74 74 2 U 1 scaffold yes proximity_ligation +RAND-002 75 118 3 W RAND-002 75 118 + +RAND-002 119 122 4 U 4 scaffold yes proximity_ligation +RAND-002 123 191 5 W RAND-002 123 191 + +RAND-002 192 192 6 U 1 scaffold yes proximity_ligation +RAND-002 193 198 7 W RAND-002 193 198 + +RAND-002 199 199 8 U 1 scaffold yes proximity_ligation +RAND-002 200 355 9 W RAND-002 200 355 + +RAND-003 1 51 1 W RAND-003 1 51 + +RAND-004 1 65 1 W RAND-004 1 65 + +RAND-004 66 66 2 U 1 scaffold yes proximity_ligation +RAND-004 67 403 3 W RAND-004 67 403 + +RAND-005 1 42 1 W RAND-005 1 42 + +RAND-005 43 43 2 U 1 scaffold yes proximity_ligation +RAND-005 44 63 3 W RAND-005 44 63 + +RAND-005 64 64 4 U 1 scaffold yes proximity_ligation +RAND-005 65 137 5 W RAND-005 65 137 + +RAND-006 1 90 1 W RAND-006 1 90 + +RAND-007 1 25 1 W RAND-007 1 25 + +RAND-007 26 26 2 U 1 scaffold yes proximity_ligation +RAND-007 27 89 3 W RAND-007 27 89 + +RAND-007 90 90 4 U 1 scaffold yes proximity_ligation +RAND-007 91 102 5 W RAND-007 91 102 + +RAND-007 103 103 6 U 1 scaffold yes proximity_ligation +RAND-007 104 109 7 W RAND-007 104 109 + +RAND-007 110 110 8 U 1 scaffold yes proximity_ligation +RAND-007 111 243 9 W RAND-007 111 243 + +RAND-008 1 62 1 W RAND-008 1 62 + +RAND-008 63 63 2 U 1 scaffold yes proximity_ligation +RAND-008 64 74 3 W RAND-008 64 74 + +RAND-008 75 75 4 U 1 scaffold yes proximity_ligation +RAND-008 76 366 5 W RAND-008 76 366 + +RAND-009 1 46 1 W RAND-009 1 46 + +RAND-009 47 47 2 U 1 scaffold yes proximity_ligation +RAND-009 48 99 3 W RAND-009 48 99 + +RAND-009 100 100 4 U 1 scaffold yes proximity_ligation +RAND-009 101 247 5 W RAND-009 101 247 + +RAND-009 248 248 6 U 1 scaffold yes proximity_ligation +RAND-009 249 263 7 W RAND-009 249 263 + +RAND-009 264 264 8 U 1 scaffold yes proximity_ligation +RAND-009 265 277 9 W RAND-009 265 277 + +RAND-009 278 278 10 U 1 scaffold yes proximity_ligation +RAND-009 279 394 11 W RAND-009 279 394 + +RAND-010 1 133 1 W RAND-010 1 133 + +RAND-010 134 134 2 U 1 scaffold yes proximity_ligation +RAND-010 135 325 3 W RAND-010 135 325 + +RAND-011 1 9 1 W RAND-011 1 9 + +RAND-011 10 10 2 U 1 scaffold yes proximity_ligation +RAND-011 11 111 3 W RAND-011 11 111 + +RAND-011 112 112 4 U 1 scaffold yes proximity_ligation +RAND-011 113 384 5 W RAND-011 113 384 + +RAND-011 385 385 6 U 1 scaffold yes proximity_ligation +RAND-011 386 414 7 W RAND-011 386 414 + +RAND-012 1 66 1 W RAND-012 1 66 + +RAND-013 1 1 1 W RAND-013 1 1 + +RAND-013 2 2 2 U 1 scaffold yes proximity_ligation +RAND-013 3 3 3 W RAND-013 3 3 + +RAND-013 4 4 4 U 1 scaffold yes proximity_ligation +RAND-013 5 44 5 W RAND-013 5 44 + +RAND-013 45 45 6 U 1 scaffold yes proximity_ligation +RAND-013 46 53 7 W RAND-013 46 53 + +RAND-013 54 54 8 U 1 scaffold yes proximity_ligation +RAND-013 55 242 9 W RAND-013 55 242 + +RAND-013 243 243 10 U 1 scaffold yes proximity_ligation +RAND-013 244 511 11 W RAND-013 244 511 + +RAND-013 512 512 12 U 1 scaffold yes proximity_ligation +RAND-013 513 598 13 W RAND-013 513 598 + +RAND-014 1 41 1 W RAND-014 1 41 + +RAND-014 42 42 2 U 1 scaffold yes proximity_ligation +RAND-014 43 279 3 W RAND-014 43 279 + +RAND-014 280 280 4 U 1 scaffold yes proximity_ligation +RAND-014 281 366 5 W RAND-014 281 366 + +RAND-015 1 87 1 W RAND-015 1 87 + +RAND-016 1 71 1 W RAND-016 1 71 + +RAND-016 72 72 2 U 1 scaffold yes proximity_ligation +RAND-016 73 89 3 W RAND-016 73 89 + +RAND-016 90 90 4 U 1 scaffold yes proximity_ligation +RAND-016 91 120 5 W RAND-016 91 120 + +RAND-017 1 75 1 W RAND-017 1 75 + +RAND-017 76 76 2 U 1 scaffold yes proximity_ligation +RAND-017 77 233 3 W RAND-017 77 233 + +RAND-017 234 234 4 U 1 scaffold yes proximity_ligation +RAND-017 235 332 5 W RAND-017 235 332 + +RAND-017 333 333 6 U 1 scaffold yes proximity_ligation +RAND-017 334 339 7 W RAND-017 334 339 + +RAND-017 340 340 8 U 1 scaffold yes proximity_ligation +RAND-017 341 399 9 W RAND-017 341 399 + +RAND-017 400 400 10 U 1 scaffold yes proximity_ligation +RAND-017 401 509 11 W RAND-017 401 509 + +RAND-017 510 510 12 U 1 scaffold yes proximity_ligation +RAND-017 511 573 13 W RAND-017 511 573 + +RAND-018 1 36 1 W RAND-018 1 36 + +RAND-018 37 37 2 U 1 scaffold yes proximity_ligation +RAND-018 38 53 3 W RAND-018 38 53 + +RAND-019 1 39 1 W RAND-019 1 39 + +RAND-020 1 115 1 W RAND-020 1 115 + +RAND-020 116 116 2 U 1 scaffold yes proximity_ligation +RAND-020 117 478 3 W RAND-020 117 478 + +RAND-020 479 479 4 U 1 scaffold yes proximity_ligation +RAND-020 480 500 5 W RAND-020 480 500 + +RAND-020 501 501 6 U 1 scaffold yes proximity_ligation +RAND-020 502 514 7 W RAND-020 502 514 + +RAND-021 1 19 1 W RAND-021 1 19 + +RAND-022 1 29 1 W RAND-022 1 29 + +RAND-023 1 199 1 W RAND-023 1 199 + +RAND-023 200 200 2 U 1 scaffold yes proximity_ligation +RAND-023 201 302 3 W RAND-023 201 302 + +RAND-023 303 303 4 U 1 scaffold yes proximity_ligation +RAND-023 304 376 5 W RAND-023 304 376 + +RAND-023 377 377 6 U 1 scaffold yes proximity_ligation +RAND-023 378 411 7 W RAND-023 378 411 + +RAND-024 1 19 1 W RAND-024 1 19 + +RAND-024 20 20 2 U 1 scaffold yes proximity_ligation +RAND-024 21 42 3 W RAND-024 21 42 + +RAND-025 1 23 1 W RAND-025 1 23 + +RAND-025 24 24 2 U 1 scaffold yes proximity_ligation +RAND-025 25 93 3 W RAND-025 25 93 + +RAND-025 94 94 4 U 1 scaffold yes proximity_ligation +RAND-025 95 233 5 W RAND-025 95 233 + +RAND-025 234 234 6 U 1 scaffold yes proximity_ligation +RAND-025 235 389 7 W RAND-025 235 389 + +RAND-025 390 390 8 U 1 scaffold yes proximity_ligation +RAND-025 391 504 9 W RAND-025 391 504 + +RAND-025 505 505 10 U 1 scaffold yes proximity_ligation +RAND-025 506 545 11 W RAND-025 506 545 + +RAND-025 546 546 12 U 1 scaffold yes proximity_ligation +RAND-025 547 582 13 W RAND-025 547 582 + +RAND-026 1 95 1 W RAND-026 1 95 + +RAND-027 1 11 1 W RAND-027 1 11 + +RAND-027 12 12 2 U 1 scaffold yes proximity_ligation +RAND-027 13 145 3 W RAND-027 13 145 + +RAND-027 146 146 4 U 1 scaffold yes proximity_ligation +RAND-027 147 378 5 W RAND-027 147 378 + +RAND-027 379 379 6 U 1 scaffold yes proximity_ligation +RAND-027 380 414 7 W RAND-027 380 414 + +RAND-027 415 415 8 U 1 scaffold yes proximity_ligation +RAND-027 416 586 9 W RAND-027 416 586 + +RAND-028 1 28 1 W RAND-028 1 28 + +RAND-028 29 29 2 U 1 scaffold yes proximity_ligation +RAND-028 30 96 3 W RAND-028 30 96 + +RAND-028 97 97 4 U 1 scaffold yes proximity_ligation +RAND-028 98 239 5 W RAND-028 98 239 + +RAND-028 240 240 6 U 1 scaffold yes proximity_ligation +RAND-028 241 272 7 W RAND-028 241 272 + +RAND-028 273 273 8 U 1 scaffold yes proximity_ligation +RAND-028 274 281 9 W RAND-028 274 281 + +RAND-028 282 282 10 U 1 scaffold yes proximity_ligation +RAND-028 283 305 11 W RAND-028 283 305 + +RAND-028 306 306 12 U 1 scaffold yes proximity_ligation +RAND-028 307 346 13 W RAND-028 307 346 + +RAND-028 347 347 14 U 1 scaffold yes proximity_ligation +RAND-028 348 448 15 W RAND-028 348 448 + +RAND-028 449 449 16 U 1 scaffold yes proximity_ligation +RAND-028 450 455 17 W RAND-028 450 455 + +RAND-028 456 456 18 U 1 scaffold yes proximity_ligation +RAND-028 457 540 19 W RAND-028 457 540 + +RAND-028 541 541 20 U 1 scaffold yes proximity_ligation +RAND-028 542 590 21 W RAND-028 542 590 + +RAND-029 1 71 1 W RAND-029 1 71 + +RAND-029 72 72 2 U 1 scaffold yes proximity_ligation +RAND-029 73 84 3 W RAND-029 73 84 + +RAND-030 1 219 1 W RAND-030 1 219 + +RAND-030 220 220 2 U 1 scaffold yes proximity_ligation +RAND-030 221 329 3 W RAND-030 221 329 + +RAND-030 330 330 4 U 1 scaffold yes proximity_ligation +RAND-030 331 418 5 W RAND-030 331 418 + +RAND-031 1 68 1 W RAND-031 1 68 + +RAND-031 69 69 2 U 1 scaffold yes proximity_ligation +RAND-031 70 84 3 W RAND-031 70 84 + +RAND-031 85 85 4 U 1 scaffold yes proximity_ligation +RAND-031 86 247 5 W RAND-031 86 247 + +RAND-032 1 8 1 W RAND-032 1 8 + +RAND-032 9 9 2 U 1 scaffold yes proximity_ligation +RAND-032 10 224 3 W RAND-032 10 224 + +RAND-032 225 225 4 U 1 scaffold yes proximity_ligation +RAND-032 226 349 5 W RAND-032 226 349 + +RAND-032 350 350 6 U 1 scaffold yes proximity_ligation +RAND-032 351 375 7 W RAND-032 351 375 + +RAND-032 376 376 8 U 1 scaffold yes proximity_ligation +RAND-032 377 408 9 W RAND-032 377 408 + +RAND-032 409 409 10 U 1 scaffold yes proximity_ligation +RAND-032 410 488 11 W RAND-032 410 488 + +RAND-032 489 490 12 U 2 scaffold yes proximity_ligation +RAND-032 491 509 13 W RAND-032 491 509 + +RAND-033 1 52 1 W RAND-033 1 52 + +RAND-033 53 53 2 U 1 scaffold yes proximity_ligation +RAND-033 54 283 3 W RAND-033 54 283 + +RAND-033 284 284 4 U 1 scaffold yes proximity_ligation +RAND-033 285 327 5 W RAND-033 285 327 + +RAND-033 328 328 6 U 1 scaffold yes proximity_ligation +RAND-033 329 388 7 W RAND-033 329 388 + +RAND-033 389 389 8 U 1 scaffold yes proximity_ligation +RAND-033 390 437 9 W RAND-033 390 437 + +RAND-033 438 438 10 U 1 scaffold yes proximity_ligation +RAND-033 439 503 11 W RAND-033 439 503 + +RAND-033 504 504 12 U 1 scaffold yes proximity_ligation +RAND-033 505 505 13 W RAND-033 505 505 + +RAND-034 1 56 1 W RAND-034 1 56 + +RAND-034 57 57 2 U 1 scaffold yes proximity_ligation +RAND-034 58 69 3 W RAND-034 58 69 + +RAND-034 70 70 4 U 1 scaffold yes proximity_ligation +RAND-034 71 282 5 W RAND-034 71 282 + +RAND-034 283 283 6 U 1 scaffold yes proximity_ligation +RAND-034 284 326 7 W RAND-034 284 326 + +RAND-034 327 327 8 U 1 scaffold yes proximity_ligation +RAND-034 328 570 9 W RAND-034 328 570 + +RAND-034 571 571 10 U 1 scaffold yes proximity_ligation +RAND-034 572 583 11 W RAND-034 572 583 + +RAND-035 1 53 1 W RAND-035 1 53 + +RAND-035 54 54 2 U 1 scaffold yes proximity_ligation +RAND-035 55 102 3 W RAND-035 55 102 + +RAND-035 103 103 4 U 1 scaffold yes proximity_ligation +RAND-035 104 148 5 W RAND-035 104 148 + +RAND-036 1 59 1 W RAND-036 1 59 + +RAND-037 1 15 1 W RAND-037 1 15 + +RAND-037 16 16 2 U 1 scaffold yes proximity_ligation +RAND-037 17 108 3 W RAND-037 17 108 + +RAND-037 109 109 4 U 1 scaffold yes proximity_ligation +RAND-037 110 142 5 W RAND-037 110 142 + +RAND-038 1 113 1 W RAND-038 1 113 + +RAND-038 114 114 2 U 1 scaffold yes proximity_ligation +RAND-038 115 170 3 W RAND-038 115 170 + +RAND-038 171 171 4 U 1 scaffold yes proximity_ligation +RAND-038 172 439 5 W RAND-038 172 439 + +RAND-039 1 160 1 W RAND-039 1 160 + +RAND-039 161 161 2 U 1 scaffold yes proximity_ligation +RAND-039 162 188 3 W RAND-039 162 188 + +RAND-039 189 189 4 U 1 scaffold yes proximity_ligation +RAND-039 190 316 5 W RAND-039 190 316 + +RAND-039 317 317 6 U 1 scaffold yes proximity_ligation +RAND-039 318 408 7 W RAND-039 318 408 + +RAND-039 409 409 8 U 1 scaffold yes proximity_ligation +RAND-039 410 507 9 W RAND-039 410 507 + +RAND-040 1 63 1 W RAND-040 1 63 + +RAND-041 1 9 1 W RAND-041 1 9 + +RAND-041 10 10 2 U 1 scaffold yes proximity_ligation +RAND-041 11 150 3 W RAND-041 11 150 + +RAND-041 151 151 4 U 1 scaffold yes proximity_ligation +RAND-041 152 175 5 W RAND-041 152 175 + +RAND-041 176 176 6 U 1 scaffold yes proximity_ligation +RAND-041 177 188 7 W RAND-041 177 188 + +RAND-041 189 189 8 U 1 scaffold yes proximity_ligation +RAND-041 190 279 9 W RAND-041 190 279 + +RAND-041 280 280 10 U 1 scaffold yes proximity_ligation +RAND-041 281 307 11 W RAND-041 281 307 + +RAND-041 308 308 12 U 1 scaffold yes proximity_ligation +RAND-041 309 370 13 W RAND-041 309 370 + +RAND-042 1 15 1 W RAND-042 1 15 + +RAND-042 16 16 2 U 1 scaffold yes proximity_ligation +RAND-042 17 17 3 W RAND-042 17 17 + +RAND-042 18 18 4 U 1 scaffold yes proximity_ligation +RAND-042 19 91 5 W RAND-042 19 91 + +RAND-042 92 92 6 U 1 scaffold yes proximity_ligation +RAND-042 93 305 7 W RAND-042 93 305 + +RAND-042 306 306 8 U 1 scaffold yes proximity_ligation +RAND-042 307 440 9 W RAND-042 307 440 + +RAND-043 1 4 1 W RAND-043 1 4 + +RAND-043 5 5 2 U 1 scaffold yes proximity_ligation +RAND-043 6 475 3 W RAND-043 6 475 + +RAND-043 476 476 4 U 1 scaffold yes proximity_ligation +RAND-043 477 561 5 W RAND-043 477 561 + +RAND-044 1 5 1 W RAND-044 1 5 + +RAND-044 6 6 2 U 1 scaffold yes proximity_ligation +RAND-044 7 289 3 W RAND-044 7 289 + +RAND-045 1 106 1 W RAND-045 1 106 + +RAND-045 107 107 2 U 1 scaffold yes proximity_ligation +RAND-045 108 119 3 W RAND-045 108 119 + +RAND-046 1 18 1 W RAND-046 1 18 + +RAND-047 1 16 1 W RAND-047 1 16 + +RAND-048 1 95 1 W RAND-048 1 95 + +RAND-048 96 96 2 U 1 scaffold yes proximity_ligation +RAND-048 97 112 3 W RAND-048 97 112 + +RAND-048 113 113 4 U 1 scaffold yes proximity_ligation +RAND-048 114 223 5 W RAND-048 114 223 + +RAND-048 224 224 6 U 1 scaffold yes proximity_ligation +RAND-048 225 245 7 W RAND-048 225 245 + +RAND-049 1 41 1 W RAND-049 1 41 + +RAND-049 42 42 2 U 1 scaffold yes proximity_ligation +RAND-049 43 70 3 W RAND-049 43 70 + +RAND-050 1 36 1 W RAND-050 1 36 + +RAND-051 1 117 1 W RAND-051 1 117 + +RAND-051 118 118 2 U 1 scaffold yes proximity_ligation +RAND-051 119 120 3 W RAND-051 119 120 + +RAND-051 121 121 4 U 1 scaffold yes proximity_ligation +RAND-051 122 156 5 W RAND-051 122 156 + +RAND-051 157 157 6 U 1 scaffold yes proximity_ligation +RAND-051 158 219 7 W RAND-051 158 219 + +RAND-051 220 220 8 U 1 scaffold yes proximity_ligation +RAND-051 221 313 9 W RAND-051 221 313 + +RAND-051 314 314 10 U 1 scaffold yes proximity_ligation +RAND-051 315 316 11 W RAND-051 315 316 + +RAND-051 317 317 12 U 1 scaffold yes proximity_ligation +RAND-051 318 345 13 W RAND-051 318 345 + +RAND-051 346 346 14 U 1 scaffold yes proximity_ligation +RAND-051 347 418 15 W RAND-051 347 418 + +RAND-052 1 456 1 W RAND-052 1 456 + +RAND-052 457 457 2 U 1 scaffold yes proximity_ligation +RAND-052 458 461 3 W RAND-052 458 461 + +RAND-053 1 47 1 W RAND-053 1 47 + +RAND-054 1 313 1 W RAND-054 1 313 + +RAND-054 314 314 2 U 1 scaffold yes proximity_ligation +RAND-054 315 328 3 W RAND-054 315 328 + +RAND-054 329 329 4 U 1 scaffold yes proximity_ligation +RAND-054 330 361 5 W RAND-054 330 361 + +RAND-054 362 362 6 U 1 scaffold yes proximity_ligation +RAND-054 363 450 7 W RAND-054 363 450 + +RAND-054 451 451 8 U 1 scaffold yes proximity_ligation +RAND-054 452 452 9 W RAND-054 452 452 + +RAND-054 453 453 10 U 1 scaffold yes proximity_ligation +RAND-054 454 454 11 W RAND-054 454 454 + +RAND-055 1 175 1 W RAND-055 1 175 + +RAND-055 176 176 2 U 1 scaffold yes proximity_ligation +RAND-055 177 312 3 W RAND-055 177 312 + +RAND-055 313 313 4 U 1 scaffold yes proximity_ligation +RAND-055 314 349 5 W RAND-055 314 349 + +RAND-055 350 350 6 U 1 scaffold yes proximity_ligation +RAND-055 351 357 7 W RAND-055 351 357 + +RAND-055 358 358 8 U 1 scaffold yes proximity_ligation +RAND-055 359 361 9 W RAND-055 359 361 + +RAND-055 362 362 10 U 1 scaffold yes proximity_ligation +RAND-055 363 402 11 W RAND-055 363 402 + +RAND-055 403 403 12 U 1 scaffold yes proximity_ligation +RAND-055 404 468 13 W RAND-055 404 468 + +RAND-055 469 469 14 U 1 scaffold yes proximity_ligation +RAND-055 470 503 15 W RAND-055 470 503 + +RAND-055 504 504 16 U 1 scaffold yes proximity_ligation +RAND-055 505 515 17 W RAND-055 505 515 + +RAND-056 1 159 1 W RAND-056 1 159 + +RAND-056 160 160 2 U 1 scaffold yes proximity_ligation +RAND-056 161 335 3 W RAND-056 161 335 + +RAND-056 336 336 4 U 1 scaffold yes proximity_ligation +RAND-056 337 345 5 W RAND-056 337 345 + +RAND-056 346 346 6 U 1 scaffold yes proximity_ligation +RAND-056 347 446 7 W RAND-056 347 446 + +RAND-057 1 123 1 W RAND-057 1 123 + +RAND-057 124 124 2 U 1 scaffold yes proximity_ligation +RAND-057 125 163 3 W RAND-057 125 163 + +RAND-057 164 164 4 U 1 scaffold yes proximity_ligation +RAND-057 165 300 5 W RAND-057 165 300 + +RAND-057 301 301 6 U 1 scaffold yes proximity_ligation +RAND-057 302 324 7 W RAND-057 302 324 + +RAND-058 1 27 1 W RAND-058 1 27 + +RAND-059 1 148 1 W RAND-059 1 148 + +RAND-059 149 149 2 U 1 scaffold yes proximity_ligation +RAND-059 150 297 3 W RAND-059 150 297 + +RAND-060 1 16 1 W RAND-060 1 16 + +RAND-060 17 17 2 U 1 scaffold yes proximity_ligation +RAND-060 18 92 3 W RAND-060 18 92 + +RAND-060 93 93 4 U 1 scaffold yes proximity_ligation +RAND-060 94 100 5 W RAND-060 94 100 + +RAND-061 1 10 1 W RAND-061 1 10 + +RAND-062 1 13 1 W RAND-062 1 13 + +RAND-063 1 9 1 W RAND-063 1 9 + +RAND-063 10 10 2 U 1 scaffold yes proximity_ligation +RAND-063 11 77 3 W RAND-063 11 77 + +RAND-063 78 78 4 U 1 scaffold yes proximity_ligation +RAND-063 79 94 5 W RAND-063 79 94 + +RAND-063 95 95 6 U 1 scaffold yes proximity_ligation +RAND-063 96 97 7 W RAND-063 96 97 + +RAND-064 1 55 1 W RAND-064 1 55 + +RAND-064 56 56 2 U 1 scaffold yes proximity_ligation +RAND-064 57 195 3 W RAND-064 57 195 + +RAND-064 196 196 4 U 1 scaffold yes proximity_ligation +RAND-064 197 207 5 W RAND-064 197 207 + +RAND-064 208 208 6 U 1 scaffold yes proximity_ligation +RAND-064 209 235 7 W RAND-064 209 235 + +RAND-065 1 13 1 W RAND-065 1 13 + +RAND-065 14 14 2 U 1 scaffold yes proximity_ligation +RAND-065 15 19 3 W RAND-065 15 19 + +RAND-065 20 20 4 U 1 scaffold yes proximity_ligation +RAND-065 21 55 5 W RAND-065 21 55 + +RAND-065 56 56 6 U 1 scaffold yes proximity_ligation +RAND-065 57 80 7 W RAND-065 57 80 + +RAND-066 1 17 1 W RAND-066 1 17 + +RAND-066 18 18 2 U 1 scaffold yes proximity_ligation +RAND-066 19 45 3 W RAND-066 19 45 + +RAND-066 46 46 4 U 1 scaffold yes proximity_ligation +RAND-066 47 92 5 W RAND-066 47 92 + +RAND-067 1 272 1 W RAND-067 1 272 + +RAND-068 1 93 1 W RAND-068 1 93 + +RAND-069 1 130 1 W RAND-069 1 130 + +RAND-069 131 131 2 U 1 scaffold yes proximity_ligation +RAND-069 132 201 3 W RAND-069 132 201 + +RAND-070 1 39 1 W RAND-070 1 39 + +RAND-070 40 40 2 U 1 scaffold yes proximity_ligation +RAND-070 41 88 3 W RAND-070 41 88 + +RAND-071 1 42 1 W RAND-071 1 42 + +RAND-072 1 381 1 W RAND-072 1 381 + +RAND-072 382 382 2 U 1 scaffold yes proximity_ligation +RAND-072 383 456 3 W RAND-072 383 456 + +RAND-072 457 457 4 U 1 scaffold yes proximity_ligation +RAND-072 458 462 5 W RAND-072 458 462 + +RAND-073 1 77 1 W RAND-073 1 77 + +RAND-073 78 79 2 U 2 scaffold yes proximity_ligation +RAND-073 80 325 3 W RAND-073 80 325 + +RAND-073 326 326 4 U 1 scaffold yes proximity_ligation +RAND-073 327 336 5 W RAND-073 327 336 + +RAND-073 337 337 6 U 1 scaffold yes proximity_ligation +RAND-073 338 339 7 W RAND-073 338 339 + +RAND-073 340 340 8 U 1 scaffold yes proximity_ligation +RAND-073 341 361 9 W RAND-073 341 361 + +RAND-073 362 362 10 U 1 scaffold yes proximity_ligation +RAND-073 363 364 11 W RAND-073 363 364 + +RAND-073 365 365 12 U 1 scaffold yes proximity_ligation +RAND-073 366 471 13 W RAND-073 366 471 + +RAND-073 472 472 14 U 1 scaffold yes proximity_ligation +RAND-073 473 500 15 W RAND-073 473 500 + +RAND-074 1 172 1 W RAND-074 1 172 + +RAND-074 173 173 2 U 1 scaffold yes proximity_ligation +RAND-074 174 215 3 W RAND-074 174 215 + +RAND-075 1 36 1 W RAND-075 1 36 + +RAND-075 37 37 2 U 1 scaffold yes proximity_ligation +RAND-075 38 221 3 W RAND-075 38 221 + +RAND-075 222 222 4 U 1 scaffold yes proximity_ligation +RAND-075 223 304 5 W RAND-075 223 304 + +RAND-075 305 305 6 U 1 scaffold yes proximity_ligation +RAND-075 306 404 7 W RAND-075 306 404 + +RAND-076 1 146 1 W RAND-076 1 146 + +RAND-076 147 147 2 U 1 scaffold yes proximity_ligation +RAND-076 148 190 3 W RAND-076 148 190 + +RAND-076 191 191 4 U 1 scaffold yes proximity_ligation +RAND-076 192 228 5 W RAND-076 192 228 + +RAND-076 229 229 6 U 1 scaffold yes proximity_ligation +RAND-076 230 266 7 W RAND-076 230 266 + +RAND-076 267 267 8 U 1 scaffold yes proximity_ligation +RAND-076 268 308 9 W RAND-076 268 308 + +RAND-077 1 11 1 W RAND-077 1 11 + +RAND-077 12 12 2 U 1 scaffold yes proximity_ligation +RAND-077 13 71 3 W RAND-077 13 71 + +RAND-078 1 80 1 W RAND-078 1 80 + +RAND-078 81 81 2 U 1 scaffold yes proximity_ligation +RAND-078 82 177 3 W RAND-078 82 177 + +RAND-079 1 13 1 W RAND-079 1 13 + +RAND-079 14 14 2 U 1 scaffold yes proximity_ligation +RAND-079 15 308 3 W RAND-079 15 308 + +RAND-079 309 309 4 U 1 scaffold yes proximity_ligation +RAND-079 310 457 5 W RAND-079 310 457 + +RAND-080 1 271 1 W RAND-080 1 271 + +RAND-080 272 272 2 U 1 scaffold yes proximity_ligation +RAND-080 273 283 3 W RAND-080 273 283 + +RAND-080 284 284 4 U 1 scaffold yes proximity_ligation +RAND-080 285 298 5 W RAND-080 285 298 + +RAND-081 1 46 1 W RAND-081 1 46 + +RAND-081 47 47 2 U 1 scaffold yes proximity_ligation +RAND-081 48 68 3 W RAND-081 48 68 + +RAND-081 69 69 4 U 1 scaffold yes proximity_ligation +RAND-081 70 103 5 W RAND-081 70 103 + +RAND-081 104 104 6 U 1 scaffold yes proximity_ligation +RAND-081 105 114 7 W RAND-081 105 114 + +RAND-081 115 115 8 U 1 scaffold yes proximity_ligation +RAND-081 116 225 9 W RAND-081 116 225 + +RAND-081 226 226 10 U 1 scaffold yes proximity_ligation +RAND-081 227 237 11 W RAND-081 227 237 + +RAND-081 238 238 12 U 1 scaffold yes proximity_ligation +RAND-081 239 369 13 W RAND-081 239 369 + +RAND-081 370 370 14 U 1 scaffold yes proximity_ligation +RAND-081 371 417 15 W RAND-081 371 417 + +RAND-082 1 210 1 W RAND-082 1 210 + +RAND-082 211 211 2 U 1 scaffold yes proximity_ligation +RAND-082 212 328 3 W RAND-082 212 328 + +RAND-083 1 327 1 W RAND-083 1 327 + +RAND-083 328 328 2 U 1 scaffold yes proximity_ligation +RAND-083 329 375 3 W RAND-083 329 375 + +RAND-083 376 376 4 U 1 scaffold yes proximity_ligation +RAND-083 377 389 5 W RAND-083 377 389 + +RAND-084 1 119 1 W RAND-084 1 119 + +RAND-084 120 120 2 U 1 scaffold yes proximity_ligation +RAND-084 121 133 3 W RAND-084 121 133 + +RAND-085 1 148 1 W RAND-085 1 148 + +RAND-085 149 149 2 U 1 scaffold yes proximity_ligation +RAND-085 150 245 3 W RAND-085 150 245 + +RAND-085 246 246 4 U 1 scaffold yes proximity_ligation +RAND-085 247 333 5 W RAND-085 247 333 + +RAND-086 1 13 1 W RAND-086 1 13 + +RAND-086 14 14 2 U 1 scaffold yes proximity_ligation +RAND-086 15 157 3 W RAND-086 15 157 + +RAND-086 158 158 4 U 1 scaffold yes proximity_ligation +RAND-086 159 256 5 W RAND-086 159 256 + +RAND-086 257 257 6 U 1 scaffold yes proximity_ligation +RAND-086 258 259 7 W RAND-086 258 259 + +RAND-087 1 155 1 W RAND-087 1 155 + +RAND-087 156 156 2 U 1 scaffold yes proximity_ligation +RAND-087 157 278 3 W RAND-087 157 278 + +RAND-087 279 279 4 U 1 scaffold yes proximity_ligation +RAND-087 280 325 5 W RAND-087 280 325 + +RAND-087 326 326 6 U 1 scaffold yes proximity_ligation +RAND-087 327 358 7 W RAND-087 327 358 + +RAND-088 1 5 1 W RAND-088 1 5 + +RAND-088 6 6 2 U 1 scaffold yes proximity_ligation +RAND-088 7 24 3 W RAND-088 7 24 + +RAND-088 25 25 4 U 1 scaffold yes proximity_ligation +RAND-088 26 86 5 W RAND-088 26 86 + +RAND-088 87 87 6 U 1 scaffold yes proximity_ligation +RAND-088 88 100 7 W RAND-088 88 100 + +RAND-089 1 23 1 W RAND-089 1 23 + +RAND-089 24 24 2 U 1 scaffold yes proximity_ligation +RAND-089 25 38 3 W RAND-089 25 38 + +RAND-089 39 39 4 U 1 scaffold yes proximity_ligation +RAND-089 40 60 5 W RAND-089 40 60 + +RAND-089 61 61 6 U 1 scaffold yes proximity_ligation +RAND-089 62 111 7 W RAND-089 62 111 + +RAND-089 112 112 8 U 1 scaffold yes proximity_ligation +RAND-089 113 188 9 W RAND-089 113 188 + +RAND-090 1 14 1 W RAND-090 1 14 + +RAND-090 15 15 2 U 1 scaffold yes proximity_ligation +RAND-090 16 70 3 W RAND-090 16 70 + +RAND-090 71 71 4 U 1 scaffold yes proximity_ligation +RAND-090 72 217 5 W RAND-090 72 217 + +RAND-090 218 218 6 U 1 scaffold yes proximity_ligation +RAND-090 219 230 7 W RAND-090 219 230 + +RAND-091 1 69 1 W RAND-091 1 69 + +RAND-091 70 70 2 U 1 scaffold yes proximity_ligation +RAND-091 71 164 3 W RAND-091 71 164 + +RAND-092 1 305 1 W RAND-092 1 305 + +RAND-092 306 306 2 U 1 scaffold yes proximity_ligation +RAND-092 307 406 3 W RAND-092 307 406 + +RAND-093 1 14 1 W RAND-093 1 14 + +RAND-093 15 15 2 U 1 scaffold yes proximity_ligation +RAND-093 16 154 3 W RAND-093 16 154 + +RAND-093 155 155 4 U 1 scaffold yes proximity_ligation +RAND-093 156 252 5 W RAND-093 156 252 + +RAND-093 253 253 6 U 1 scaffold yes proximity_ligation +RAND-093 254 378 7 W RAND-093 254 378 + +RAND-093 379 379 8 U 1 scaffold yes proximity_ligation +RAND-093 380 413 9 W RAND-093 380 413 + +RAND-093 414 414 10 U 1 scaffold yes proximity_ligation +RAND-093 415 426 11 W RAND-093 415 426 + +RAND-094 1 243 1 W RAND-094 1 243 + +RAND-095 1 84 1 W RAND-095 1 84 + +RAND-096 1 15 1 W RAND-096 1 15 + +RAND-096 16 16 2 U 1 scaffold yes proximity_ligation +RAND-096 17 37 3 W RAND-096 17 37 + +RAND-097 1 152 1 W RAND-097 1 152 + +RAND-097 153 153 2 U 1 scaffold yes proximity_ligation +RAND-097 154 496 3 W RAND-097 154 496 + +RAND-098 1 214 1 W RAND-098 1 214 + +RAND-098 215 215 2 U 1 scaffold yes proximity_ligation +RAND-098 216 256 3 W RAND-098 216 256 + +RAND-098 257 257 4 U 1 scaffold yes proximity_ligation +RAND-098 258 345 5 W RAND-098 258 345 + +RAND-098 346 346 6 U 1 scaffold yes proximity_ligation +RAND-098 347 355 7 W RAND-098 347 355 + +RAND-099 1 298 1 W RAND-099 1 298 + +RAND-099 299 299 2 U 1 scaffold yes proximity_ligation +RAND-099 300 392 3 W RAND-099 300 392 + +RAND-100 1 3 1 W RAND-100 1 3 + +RAND-100 4 4 2 U 1 scaffold yes proximity_ligation +RAND-100 5 27 3 W RAND-100 5 27 + diff --git a/tests/fasta_test.py b/tests/fasta_test.py index 1a6fcec..ad44561 100644 --- a/tests/fasta_test.py +++ b/tests/fasta_test.py @@ -2,7 +2,7 @@ import pytest -from tola.fasta.index import index_fasta_file +from tola.fasta.index import FastaIndex, index_fasta_file def list_fasta_files(): @@ -14,11 +14,10 @@ def list_fasta_files(): @pytest.mark.parametrize("fasta_file", list_fasta_files()) def test_fai(fasta_file): - fai_file = pathlib.Path(str(fasta_file) + ".fai") - if not fai_file.exists(): - msg = f"Missing expected '.fai' file: {fai_file}" - raise ValueError(msg) - fai_str = fai_file.read_text() - info = index_fasta_file(fasta_file) - test_str = "".join(x.fai_row() for x in info) - assert test_str == fai_str + idx = FastaIndex(fasta_file) + idx.load_index() + idx_dict, asm = index_fasta_file(fasta_file) + assert idx_dict == idx.index + idx.load_assembly() + asm.header = idx.assembly.header = [] + assert str(asm) == str(idx.assembly)