Commit 2142609

added new function to load single files with custom csv formats, plus wrapper for default qPCR data
1 parent 15c3290 commit 2142609

File tree

3 files changed (+245, -0 lines changed)


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,6 +1,7 @@
 .vscode
 .ruff_cache
 .*_cache
+.DS_Store
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

src/rushd/flow.py

Lines changed: 97 additions & 0 deletions
@@ -41,6 +41,10 @@ class GroupsError(RuntimeError):
     """Error raised when there is an issue with the data groups DataFrame."""


+class ColumnError(RuntimeError):
+    """Error raised when the data is missing a column specifying well IDs."""
+
+
 class MOIinputError(RuntimeError):
     """Error raised when there is an issue with the provided dataframe."""

@@ -241,6 +245,99 @@ def load_groups_with_metadata(
     return data


+def load_single_csv_with_metadata(
+    data_path: Union[str, Path],
+    yaml_path: Union[str, Path],
+    *,
+    well_column: Optional[str] = None,
+    columns: Optional[List[str]] = None,
+    csv_kwargs: Optional[Dict[str, Any]] = None,
+) -> pd.DataFrame:
+    """
+    Load .csv data into DataFrame with associated metadata.
+
+    Generates a pandas DataFrame from a single .csv file located at the given path,
+    adding columns for metadata encoded by a given .yaml file. Metadata is associated
+    with the data based on well IDs encoded in one of the data columns.
+
+    Parameters
+    ----------
+    data_path: str or Path
+        Path to the data file (.csv)
+    yaml_path: str or Path
+        Path to .yaml file to use for associating metadata with well IDs.
+        All metadata must be contained under the header 'metadata'.
+    well_column: Optional str
+        If specified, the name of the data column containing well IDs; this column
+        is renamed to 'well'. If None, the data must already contain a 'well' column.
+    columns: Optional list of strings
+        If specified, only the specified columns are loaded out of the CSV file.
+        This can drastically reduce the amount of memory required to load
+        flow data.
+    csv_kwargs: Optional dict
+        Extra keyword arguments passed through to pandas.read_csv,
+        e.g. a custom separator or header row.
+
+    Returns
+    -------
+    A single pandas DataFrame containing all data with associated metadata.
+    """
+    if not isinstance(data_path, Path):
+        data_path = Path(data_path)
+    if csv_kwargs is None:
+        csv_kwargs = {}
+
+    try:
+        metadata_map = load_well_metadata(yaml_path)
+    except FileNotFoundError as err:
+        raise YamlError("Specified metadata YAML file does not exist!") from err
+
+    # Load data from a single file
+    file = data_path
+
+    # Load the first row so we get the column names
+    df_onerow = pd.read_csv(file, nrows=1, **csv_kwargs)
+    # Load data: we allow extra columns in our column list, so subset it
+    valid_cols = (
+        list(set(columns + [well_column]).intersection(set(df_onerow.columns)))
+        if columns is not None
+        else None
+    )
+    data = pd.read_csv(file, usecols=valid_cols, **csv_kwargs)
+
+    if well_column is not None:
+        if well_column not in data.columns:
+            raise ColumnError(
+                f"The file at 'data_path' does not contain the column '{well_column}'"
+            )
+        data.rename(columns={well_column: 'well'}, inplace=True)
+
+    # Add metadata to DataFrame
+    metadata = pd.DataFrame.from_dict(metadata_map).reset_index(names='well')
+    data = data.merge(metadata, how='left', on='well').replace(np.nan, pd.NA)
+
+    return data
+
+
+def load_qpcr_with_metadata(
+    data_path: Union[str, Path],
+    yaml_path: Union[str, Path],
+) -> pd.DataFrame:
+    """
+    Load qPCR data into DataFrame with associated metadata.
+
+    Wrapper for 'load_single_csv_with_metadata' using the default file format for qPCR data.
+
+    Generates a pandas DataFrame from a single tab-separated file located at the given path,
+    adding columns for metadata encoded by a given .yaml file. Metadata is associated
+    with the data based on well IDs encoded in the 'Pos' column.
+
+    Parameters
+    ----------
+    data_path: str or Path
+        Path to the data file
+    yaml_path: str or Path
+        Path to .yaml file to use for associating metadata with well IDs.
+        All metadata must be contained under the header 'metadata'.
+
+    Returns
+    -------
+    A single pandas DataFrame containing all data (Cp values) with metadata associated with each well.
+    """
+    return load_single_csv_with_metadata(
+        data_path, yaml_path, well_column='Pos', columns=['Cp'],
+        csv_kwargs=dict(sep='\t', header=1)
+    )
+
+
 def moi(
     data_frame: pd.DataFrame,
     color_column_name: str,
tests/test_flow.py

Lines changed: 147 additions & 0 deletions
@@ -588,3 +588,150 @@ def test_group_custom_regex(tmp_path: Path):
     print(df["plate"])
     print(df_manual["plate"])
     pd.testing.assert_frame_equal(df_manual, df)
+
+
+def test_single_csv(tmp_path: Path):
+    """
+    Tests that a single file can be read using defaults
+    """
+    with open(str(tmp_path / "test.yaml"), "w") as f:
+        f.write(
+            """
+            metadata:
+              condition:
+                - cond1: A1
+            """
+        )
+    with open(str(tmp_path / "data.csv"), "w") as f:
+        f.write("""well,channel1,channel2\nA1,1,2""")
+    yaml_path = str(tmp_path) + "/test.yaml"
+    df = flow.load_single_csv_with_metadata(str(tmp_path) + "/data.csv", yaml_path)
+    df.sort_values(by="well", inplace=True, ignore_index=True)
+
+    data = [["A1", 1, 2, "cond1"]]
+    df_manual = pd.DataFrame(
+        data, columns=["well", "channel1", "channel2", "condition"]
+    )
+    assert df.equals(df_manual)
+
+
+def test_single_csv_kwargs(tmp_path: Path):
+    """
+    Tests that a single file can be read using custom kwargs
+    """
+    with open(str(tmp_path / "test.yaml"), "w") as f:
+        f.write(
+            """
+            metadata:
+              condition:
+                - cond1: A1
+            """
+        )
+    with open(str(tmp_path / "data.txt"), "w") as f:
+        f.write("""well\tchannel1\tchannel2\nA1\t1\t2""")
+    yaml_path = str(tmp_path) + "/test.yaml"
+    df = flow.load_single_csv_with_metadata(
+        str(tmp_path) + "/data.txt", yaml_path, csv_kwargs=dict(sep='\t')
+    )
+    df.sort_values(by="well", inplace=True, ignore_index=True)
+
+    data = [["A1", 1, 2, "cond1"]]
+    df_manual = pd.DataFrame(
+        data, columns=["well", "channel1", "channel2", "condition"]
+    )
+    assert df.equals(df_manual)
+
+
+def test_single_csv_well_column(tmp_path: Path):
+    """
+    Tests that a single file can be read using a custom well column
+    """
+    with open(str(tmp_path / "test.yaml"), "w") as f:
+        f.write(
+            """
+            metadata:
+              condition:
+                - cond1: A1
+            """
+        )
+    with open(str(tmp_path / "data.csv"), "w") as f:
+        f.write("""my_well,channel1,channel2\nA1,1,2""")
+    yaml_path = str(tmp_path) + "/test.yaml"
+    df = flow.load_single_csv_with_metadata(
+        str(tmp_path) + "/data.csv", yaml_path, well_column='my_well'
+    )
+    df.sort_values(by="well", inplace=True, ignore_index=True)
+
+    data = [["A1", 1, 2, "cond1"]]
+    df_manual = pd.DataFrame(
+        data, columns=["well", "channel1", "channel2", "condition"]
+    )
+    assert df.equals(df_manual)
+    # Reload specifying columns
+    df = flow.load_single_csv_with_metadata(
+        str(tmp_path) + "/data.csv", yaml_path, well_column='my_well', columns=['channel1']
+    )
+    assert "channel1" in df.columns
+    assert "channel2" not in df.columns
+
+
+def test_single_csv_well_column_error(tmp_path: Path):
+    """
+    Tests that a custom well column that is missing from the data correctly raises an error
+    """
+    with open(str(tmp_path / "test.yaml"), "w") as f:
+        f.write(
+            """
+            metadata:
+              condition:
+                - cond1: A1
+            """
+        )
+    with open(str(tmp_path / "data.csv"), "w") as f:
+        f.write("""my_well,channel1,channel2\nA1,1,2""")
+    yaml_path = str(tmp_path) + "/test.yaml"
+
+    with pytest.raises(flow.ColumnError):
+        _ = flow.load_single_csv_with_metadata(
+            str(tmp_path) + "/data.csv", yaml_path, well_column='other_well'
+        )
+
+
+def test_qpcr_loading(tmp_path: Path):
+    """
+    Tests that a file with the default qPCR output format can be loaded
+    """
+    with open(str(tmp_path / "test.yaml"), "w") as f:
+        f.write(
+            """
+            metadata:
+              condition:
+                - cond1: A1
+            """
+        )
+    with open(str(tmp_path / "data.txt"), "w") as f:
+        f.write("""Nonsense first line\nPos\tCp\textra channel\nA1\t1\t2""")
+    yaml_path = str(tmp_path) + "/test.yaml"
+    df = flow.load_qpcr_with_metadata(str(tmp_path) + "/data.txt", yaml_path)
+    df.sort_values(by="well", inplace=True, ignore_index=True)
+
+    data = [["A1", 1, "cond1"]]
+    df_manual = pd.DataFrame(
+        data, columns=["well", "Cp", "condition"]
+    )
+    assert df.equals(df_manual)
+
+
+def test_qpcr_loading_real_data(tmp_path: Path):
+    """
+    Tests that a file with the default qPCR output format can be loaded,
+    copy-pasting from an actual qPCR output file
+    """
+    with open(str(tmp_path / "test.yaml"), "w") as f:
+        f.write(
+            """
+            metadata:
+              condition:
+                - cond1: A1
+            """
+        )
+    with open(str(tmp_path / "data.txt"), "w") as f:
+        f.write(
+            "Experiment: 2025.08.07_galloway-gaprun-lib-quant_KL Selected Filter: SYBR Green I / HRM Dye (465-510)\n"
+            "Include\tColor\tPos\tName\tCp\tConcentration\tStandard\tStatus\n"
+            "True\t255\tA1\tSample 1\t27.23\t0\t"
+        )
+    yaml_path = str(tmp_path) + "/test.yaml"
+    df = flow.load_qpcr_with_metadata(str(tmp_path) + "/data.txt", yaml_path)
+    df.sort_values(by="well", inplace=True, ignore_index=True)
+
+    data = [["A1", 27.23, "cond1"]]
+    df_manual = pd.DataFrame(
+        data, columns=["well", "Cp", "condition"]
+    )
+    assert df.equals(df_manual)