Commit 2142609

added new function to load single files with custom csv formats, plus wrapper for default qPCR data
1 parent 15c3290 commit 2142609

File tree

3 files changed (+245, -0 lines changed)


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 .vscode
 .ruff_cache
 .*_cache
+.DS_Store
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

src/rushd/flow.py

Lines changed: 97 additions & 0 deletions
@@ -41,6 +41,10 @@ class GroupsError(RuntimeError):
     """Error raised when there is an issue with the data groups DataFrame."""


+class ColumnError(RuntimeError):
+    """Error raised when the data is missing a column specifying well IDs."""
+
+
 class MOIinputError(RuntimeError):
     """Error raised when there is an issue with the provided dataframe."""

@@ -241,6 +245,99 @@ def load_groups_with_metadata(
     return data


+def load_single_csv_with_metadata(
+    data_path: Union[str, Path],
+    yaml_path: Union[str, Path],
+    *,
+    well_column: Optional[str] = None,
+    columns: Optional[List[str]] = None,
+    csv_kwargs: Optional[Dict[str, Any]] = None,
+) -> pd.DataFrame:
+    """
+    Load .csv data into DataFrame with associated metadata.
+
+    Generates a pandas DataFrame from a single .csv file located at the given path,
+    adding columns for metadata encoded by a given .yaml file. Metadata is associated
+    with the data based on well IDs encoded in one of the data columns.
+
+    Parameters
+    ----------
+    data_path: str or Path
+        Path to the data file (.csv)
+    yaml_path: str or Path
+        Path to .yaml file to use for associating metadata with well IDs.
+        All metadata must be contained under the header 'metadata'.
+    well_column: Optional str
+        If specified, the name of the data column containing well IDs; this column
+        is renamed to 'well'. If None, the data must already contain a 'well' column.
+    columns: Optional list of strings
+        If specified, only the specified columns are loaded out of the CSV file.
+        This can drastically reduce the amount of memory required to load
+        flow data.
+    csv_kwargs: Optional dict
+        Extra keyword arguments passed through to pandas.read_csv,
+        e.g. a custom separator or header row.
+
+    Returns
+    -------
+    A single pandas DataFrame containing all data with associated metadata.
+    """
+    if not isinstance(data_path, Path):
+        data_path = Path(data_path)
+    if csv_kwargs is None:
+        csv_kwargs = {}
+
+    try:
+        metadata_map = load_well_metadata(yaml_path)
+    except FileNotFoundError as err:
+        raise YamlError("Specified metadata YAML file does not exist!") from err
+
+    # Load data from a single file
+    file = data_path
+
+    # Load the first row so we get the column names
+    df_onerow = pd.read_csv(file, nrows=1, **csv_kwargs)
+    # Load data: we allow extra columns in our column list, so subset it
+    valid_cols = (
+        list(set(columns + [well_column]).intersection(set(df_onerow.columns)))
+        if columns is not None
+        else None
+    )
+    data = pd.read_csv(file, usecols=valid_cols, **csv_kwargs)
+
+    if well_column is not None:
+        if well_column not in data.columns:
+            raise ColumnError(
+                f"The file at 'data_path' does not contain the column '{well_column}'"
+            )
+        data.rename(columns={well_column: 'well'}, inplace=True)
+
+    # Add metadata to DataFrame
+    metadata = pd.DataFrame.from_dict(metadata_map).reset_index(names='well')
+    data = data.merge(metadata, how='left', on='well').replace(np.nan, pd.NA)
+
+    return data
+
+
+def load_qpcr_with_metadata(
+    data_path: Union[str, Path],
+    yaml_path: Union[str, Path],
+) -> pd.DataFrame:
+    """
+    Load qPCR data into DataFrame with associated metadata.
+
+    Wrapper for 'load_single_csv_with_metadata' using the default file format for qPCR data.
+
+    Generates a pandas DataFrame from a single tab-separated file located at the given path,
+    adding columns for metadata encoded by a given .yaml file. Metadata is associated
+    with the data based on well IDs encoded in the 'Pos' column.
+
+    Parameters
+    ----------
+    data_path: str or Path
+        Path to the data file
+    yaml_path: str or Path
+        Path to .yaml file to use for associating metadata with well IDs.
+        All metadata must be contained under the header 'metadata'.
+
+    Returns
+    -------
+    A single pandas DataFrame containing all data (Cp values) with metadata associated with each well.
+    """
+    return load_single_csv_with_metadata(
+        data_path, yaml_path, well_column='Pos', columns=['Cp'],
+        csv_kwargs=dict(sep='\t', header=1)
+    )
+
+
 def moi(
     data_frame: pd.DataFrame,
     color_column_name: str,
tests/test_flow.py

Lines changed: 147 additions & 0 deletions
@@ -588,3 +588,150 @@ def test_group_custom_regex(tmp_path: Path):
     print(df["plate"])
     print(df_manual["plate"])
     pd.testing.assert_frame_equal(df_manual, df)
+
+
+def test_single_csv(tmp_path: Path):
+    """
+    Tests that a single file can be read using defaults
+    """
+    with open(str(tmp_path / "test.yaml"), "w") as f:
+        f.write(
+            """
+            metadata:
+              condition:
+                - cond1: A1
+            """
+        )
+    with open(str(tmp_path / "data.csv"), "w") as f:
+        f.write("""well,channel1,channel2\nA1,1,2""")
+    yaml_path = str(tmp_path) + "/test.yaml"
+    df = flow.load_single_csv_with_metadata(str(tmp_path) + "/data.csv", yaml_path)
+    df.sort_values(by="well", inplace=True, ignore_index=True)
+
+    data = [["A1", 1, 2, "cond1"]]
+    df_manual = pd.DataFrame(
+        data, columns=["well", "channel1", "channel2", "condition"]
+    )
+    assert df.equals(df_manual)
+
+
+def test_single_csv_kwargs(tmp_path: Path):
+    """
+    Tests that a single file can be read using custom kwargs
+    """
+    with open(str(tmp_path / "test.yaml"), "w") as f:
+        f.write(
+            """
+            metadata:
+              condition:
+                - cond1: A1
+            """
+        )
+    with open(str(tmp_path / "data.txt"), "w") as f:
+        f.write("""well\tchannel1\tchannel2\nA1\t1\t2""")
+    yaml_path = str(tmp_path) + "/test.yaml"
+    df = flow.load_single_csv_with_metadata(
+        str(tmp_path) + "/data.txt", yaml_path, csv_kwargs=dict(sep='\t')
+    )
+    df.sort_values(by="well", inplace=True, ignore_index=True)
+
+    data = [["A1", 1, 2, "cond1"]]
+    df_manual = pd.DataFrame(
+        data, columns=["well", "channel1", "channel2", "condition"]
+    )
+    assert df.equals(df_manual)
+
+
+def test_single_csv_well_column(tmp_path: Path):
+    """
+    Tests that a single file can be read using a custom well column
+    """
+    with open(str(tmp_path / "test.yaml"), "w") as f:
+        f.write(
+            """
+            metadata:
+              condition:
+                - cond1: A1
+            """
+        )
+    with open(str(tmp_path / "data.csv"), "w") as f:
+        f.write("""my_well,channel1,channel2\nA1,1,2""")
+    yaml_path = str(tmp_path) + "/test.yaml"
+    df = flow.load_single_csv_with_metadata(
+        str(tmp_path) + "/data.csv", yaml_path, well_column='my_well'
+    )
+    df.sort_values(by="well", inplace=True, ignore_index=True)
+
+    data = [["A1", 1, 2, "cond1"]]
+    df_manual = pd.DataFrame(
+        data, columns=["well", "channel1", "channel2", "condition"]
+    )
+    assert df.equals(df_manual)
+    # Reload specifying columns
+    df = flow.load_single_csv_with_metadata(
+        str(tmp_path) + "/data.csv", yaml_path, well_column='my_well', columns=['channel1']
+    )
+    assert "channel1" in df.columns
+    assert "channel2" not in df.columns
+
+
+def test_single_csv_well_column_error(tmp_path: Path):
+    """
+    Tests that a custom well column that is missing from the data correctly raises an error
+    """
+    with open(str(tmp_path / "test.yaml"), "w") as f:
+        f.write(
+            """
+            metadata:
+              condition:
+                - cond1: A1
+            """
+        )
+    with open(str(tmp_path / "data.csv"), "w") as f:
+        f.write("""my_well,channel1,channel2\nA1,1,2""")
+    yaml_path = str(tmp_path) + "/test.yaml"
+
+    with pytest.raises(flow.ColumnError):
+        _ = flow.load_single_csv_with_metadata(
+            str(tmp_path) + "/data.csv", yaml_path, well_column='other_well'
+        )
+
+
+def test_qpcr_loading(tmp_path: Path):
+    """
+    Tests that a file with the default qPCR output format can be loaded
+    """
+    with open(str(tmp_path / "test.yaml"), "w") as f:
+        f.write(
+            """
+            metadata:
+              condition:
+                - cond1: A1
+            """
+        )
+    with open(str(tmp_path / "data.txt"), "w") as f:
+        f.write("""Nonsense first line\nPos\tCp\textra channel\nA1\t1\t2""")
+    yaml_path = str(tmp_path) + "/test.yaml"
+    df = flow.load_qpcr_with_metadata(str(tmp_path) + "/data.txt", yaml_path)
+    df.sort_values(by="well", inplace=True, ignore_index=True)
+
+    data = [["A1", 1, "cond1"]]
+    df_manual = pd.DataFrame(
+        data, columns=["well", "Cp", "condition"]
+    )
+    assert df.equals(df_manual)
+
+
+def test_qpcr_loading_real_data(tmp_path: Path):
+    """
+    Tests that a file with the default qPCR output format can be loaded,
+    copy-pasting from an actual qPCR output file
+    """
+    with open(str(tmp_path / "test.yaml"), "w") as f:
+        f.write(
+            """
+            metadata:
+              condition:
+                - cond1: A1
+            """
+        )
+    with open(str(tmp_path / "data.txt"), "w") as f:
+        f.write(
+            "Experiment: 2025.08.07_galloway-gaprun-lib-quant_KL Selected Filter: SYBR Green I / HRM Dye (465-510)\n"
+            "Include\tColor\tPos\tName\tCp\tConcentration\tStandard\tStatus\n"
+            "True\t255\tA1\tSample 1\t27.23\t0\t"
+        )
+    yaml_path = str(tmp_path) + "/test.yaml"
+    df = flow.load_qpcr_with_metadata(str(tmp_path) + "/data.txt", yaml_path)
+    df.sort_values(by="well", inplace=True, ignore_index=True)
+
+    data = [["A1", 27.23, "cond1"]]
+    df_manual = pd.DataFrame(
+        data, columns=["well", "Cp", "condition"]
+    )
+    assert df.equals(df_manual)