Source code for pyrfume.keller

import pyrfume

from . import dream


def load_raw_bmc_data(nrows=None):
    """Load raw data from Keller and Vosshall, 2016 supplement.

    Args:
        nrows: Optional number of leading data rows to keep. ``None``
            (the default) returns every row.

    Returns:
        The raw data frame read from the supplementary spreadsheet, as
        expected by `format_bmc_data`.
    """
    # `header=2` is forwarded to the loader as in the original call
    # (presumably selecting the spreadsheet row that holds the column names).
    df_raw = pyrfume.load_data("keller_2016/12868_2016_287_MOESM1_ESM.xlsx", header=2)
    # `nrows` used to be accepted but ignored; honour it by truncating here.
    if nrows is not None:
        df_raw = df_raw.head(nrows)
    return df_raw


def format_bmc_data(
    df,
    only_dream_subjects=False,
    only_dream_descriptors=False,
    only_dream_molecules=False,
):
    """Format raw data from the BMC paper to be usable for modeling.

    The input frame is not modified; all work is done on a copy.

    Args:
        df: The raw data frame returned by `load_raw_bmc_data`.
        only_dream_subjects: Whether to only keep DREAM subjects.
        only_dream_descriptors: Whether to only keep DREAM descriptors.
            When False, "Familiarity" is kept as an additional descriptor.
        only_dream_molecules: Whether to only keep DREAM molecules.

    Returns:
        A data frame indexed by (Descriptor, CID, Dilution, Replicate),
        sorted by that index, with one column per Subject.
    """
    # Work on a copy so the caller's frame keeps its columns untouched
    df = df.copy()

    # Remove leading and trailing white space from column names
    df.columns = df.columns.str.strip()

    # Raw and publication-style DREAM descriptor names. Copy both lists
    # because they are edited below and the helper may hand back shared
    # objects (not visible from here).
    descriptors_raw = list(dream.get_descriptors())
    descriptors = list(dream.get_descriptors(format=True))
    # Revise to the Keller and Vosshall descriptor names
    descriptors_raw[0] = "HOW STRONG IS THE SMELL?"
    descriptors_raw[1] = "HOW PLEASANT IS THE SMELL?"

    # Possibly include "Familiarity" as a descriptor
    if not only_dream_descriptors:
        descriptors_raw.append("HOW FAMILIAR IS THE SMELL?")
        descriptors.append("Familiarity")

    # Possibly restrict subjects to those used in the DREAM challenge.
    # Note that numeric subject IDs in the BMC paper and in the DREAM
    # challenge are not identical.
    if only_dream_subjects:
        # Rows without a DREAM subject number become 0 and are dropped
        df["Subject"] = df["Subject # (DREAM challenge)"].fillna(0).astype(int)
        df = df[df["Subject"] > 0].copy()
    else:
        df["Subject"] = df["Subject # (this study)"].astype(int)

    # Rename columns to match DREAM challenge
    df = df.rename(columns={"Odor dilution": "Dilution"})
    df = df.rename(columns=dict(zip(descriptors_raw, descriptors)))

    # Fix CIDs for molecules that only have CAS registry numbers.
    # Geranylacetone didn't have a CID listed in the raw data
    # Isobutyl acetate had the wrong CAS number in the raw data
    # `regex=False`: these are literal strings, and the default for
    # `regex` differs between pandas versions.
    df["CID"] = (
        df["CID"]
        .astype(str)
        .str.replace("3796-70-1", "1549778", regex=False)
        .str.replace("109-19-0", "8038", regex=False)
        .astype(int)
    )

    # Possibly keep only the 476 DREAM challenge molecules
    if only_dream_molecules:
        dream_CIDs = dream.get_cids()
        assert len(dream_CIDs) == 476
        df = df[df["CID"].isin(dream_CIDs)]

    # Keep only relevant columns (explicit copy so later assignments are
    # made on an independent frame rather than a slice)
    df = df[["CID", "Dilution", "Subject"] + descriptors].copy()

    # Fill NaN descriptor values with 0 on rows where Intensity is above 0.
    # Rows with zero or missing Intensity keep their NaNs.
    has_intensity = df["Intensity"] > 0
    df.loc[has_intensity, descriptors] = df.loc[has_intensity, descriptors].fillna(0)

    # Convert dilution values to magnitudes, stored as floats
    df["Dilution"] = df["Dilution"].apply(dream.dilution_to_magnitude).astype(float)

    # Number the replicates of each (CID, Dilution, Subject) in order of
    # appearance: 0 for the first, 1 for the second, and so on. A running
    # count (rather than a duplicated/not-duplicated flag) keeps the index
    # unique even when a combination occurs more than twice.
    df["Replicate"] = df.groupby(["CID", "Dilution", "Subject"]).cumcount()

    # Set index and set column axis name
    df = df.set_index(["CID", "Dilution", "Replicate", "Subject"])
    df.columns.name = "Descriptor"
    if only_dream_subjects:
        # DREAM subjects replicates should be properly indexed now
        assert not df.index.duplicated().any()

    # Rearrange dataframe to pivot subjects and descriptors
    df = df.unstack("Subject").stack("Descriptor")
    df = df.reorder_levels(["Descriptor", "CID", "Dilution", "Replicate"])
    df = df.sort_index()

    return df
Source code for pyrfume.keller

Pyrfume

Navigation

Related Topics