Source code for datalad_tabby.io.xlsx

"""Conversion of `tabby` TSV files to and from XLSX multi-sheet spreadsheets
"""

from __future__ import annotations

import csv
from pathlib import Path
from typing import List

from openpyxl import (
    Workbook,
    load_workbook,
)
from openpyxl.worksheet.worksheet import Worksheet

__all__ = ['xlsx2tabby', 'tabby2xlsx']


[docs] def xlsx2tabby(src: Path, dest: Path) -> List[Path]: """Convert a tabby record in a (multi-sheet) XLSX file to a series of TSVs The TSV file collection will use the name of the XLSX file (without extension) as name prefix and append ``_<sheetname>.tsv`` it to build the full output filename in the ``dest`` directory. """ tabby_prefix = src.stem wb = load_workbook( filename=src, # see https://openpyxl.readthedocs.io/optimized.html#read-only-mode read_only=True, ) # TODO we could add support for handling single-sheet workbooks regardless # of the name of the sheet (force is to be 'dataset') outfpaths = [] for sheet in wb.sheetnames: outfpath = dest / f"{tabby_prefix}_{sheet}.tsv" _sheet2tsv(wb[sheet], outfpath) outfpaths.append(outfpath) return outfpaths
[docs] def tabby2xlsx(src: Path, dest: Path) -> Path: """Convert a tabby record as a series of TSVs to a (multi-sheet) XLSX file The XLSX file will use the name of the TSV collection (without extension and the sheet name) and append ``.xslx`` to build the full output filename in the ``dest`` directory. """ tabby_prefix = src.stem if tabby_prefix.endswith('_dataset'): # this is the root sheet, strip sheet name to discover all # other components tabby_prefix = tabby_prefix[:-8] sheets_paths = list(src.parent.glob(f'{tabby_prefix}_*.tsv')) sheet_names = [ # the sheet name is the last file name component ('_' delimiter) # without the '.tsv' extension p.name.split('_')[-1][:-4] for p in sheets_paths ] if 'dataset' not in sheet_names: raise ValueError( f"'dataset' sheet not found for tabby record {tabby_prefix!r}") wb = Workbook( # https://openpyxl.readthedocs.io/optimized.html#write-only-mode write_only=True, # prefer ISO 8601 for less interpretation freedom of dates # see https://openpyxl.readthedocs.io/en/stable/datetime.html iso_dates=True, ) for sheet_name, sheet_path in sorted(zip(sheet_names, sheets_paths)): ws = wb.create_sheet( sheet_name, # put the 'dataset' sheet first, and the rest in sorted order # after it 0 if sheet_name == 'dataset' else None, ) with sheet_path.open(newline='') as tsvfile: reader = csv.reader(tsvfile, delimiter='\t') for row in reader: ws.append(row) outfpath = dest / f'{tabby_prefix}.xlsx' wb.save(outfpath) return outfpath
def _sheet2tsv(ws: Worksheet, dest: Path): with dest.open('w', newline='') as tsvfile: writer = csv.writer( tsvfile, delimiter='\t', ) writer.writerows(ws.iter_rows(values_only=True))