Source code for pyrfume.keller
import pyrfume
from . import dream
def load_raw_bmc_data(nrows=None):
    """Load raw data from the Keller and Vosshall, 2016 supplement.

    Args:
        nrows: Optional maximum number of rows to read. When ``None`` (the
            default) no row limit is requested and the call is the same as
            it has always been.

    Returns:
        The object returned by ``pyrfume.load_data`` for the supplementary
        spreadsheet (used elsewhere in this module as a data frame).
    """
    # `header=2` selects the spreadsheet row that is used for column names.
    read_kwargs = {"header": 2}
    if nrows is not None:
        # Previously `nrows` was accepted but silently ignored.
        # NOTE(review): assumes `pyrfume.load_data` forwards extra keyword
        # arguments to the underlying reader, as it does for `header`.
        read_kwargs["nrows"] = nrows
    df_raw = pyrfume.load_data(
        "keller_2016/12868_2016_287_MOESM1_ESM.xlsx", **read_kwargs
    )
    return df_raw
def format_bmc_data(
    df,
    only_dream_subjects=False,
    only_dream_descriptors=False,
    only_dream_molecules=False,
):
    """Format raw data from the BMC paper to be usable for modeling.

    Args:
        df: The raw data frame returned by `load_raw_bmc_data`. It is
            copied internally; the caller's frame is left untouched.
        only_dream_subjects: Whether to only keep DREAM subjects. When
            true, subjects are numbered by their DREAM challenge ID;
            otherwise by their ID in this study.
        only_dream_descriptors: Whether to only keep DREAM descriptors.
            When false, "Familiarity" is kept as an extra descriptor.
        only_dream_molecules: Whether to only keep DREAM molecules.

    Returns:
        A data frame sorted by its index, whose index levels are
        ("Descriptor", "CID", "Dilution", "Replicate") and whose columns
        are the subjects.

    Raises:
        AssertionError: If the DREAM CID list does not contain 476 entries
            (only checked when `only_dream_molecules` is true), or if
            duplicate index entries remain after replicate labelling (only
            checked when `only_dream_subjects` is true).
    """
    # Work on a copy so that the column clean-up and the added "Subject"
    # column below do not leak into the caller's data frame.
    df = df.copy()

    # Remove leading and trailing white space from column names
    df.columns = df.columns.str.strip()

    # Get the raw DREAM descriptor list
    descriptors_raw = dream.get_descriptors()
    # Get the publication-style descriptor names
    descriptors = dream.get_descriptors(format=True)

    # Revise to the Keller and Vosshall descriptor names
    descriptors_raw[0] = "HOW STRONG IS THE SMELL?"
    descriptors_raw[1] = "HOW PLEASANT IS THE SMELL?"

    # Possibly include "Familiarity" as a descriptor
    if not only_dream_descriptors:
        descriptors_raw.append("HOW FAMILIAR IS THE SMELL?")
        descriptors.append("Familiarity")

    # Possibly restrict subjects to those used in the DREAM challenge
    # Note that numeric subject IDs in the BMC paper and in the DREAM
    # challenge are not identical
    if only_dream_subjects:
        # Rows without a DREAM subject number become 0 and are dropped.
        df["Subject"] = df["Subject # (DREAM challenge)"].fillna(0).astype(int)
        df = df[df["Subject"] > 0]
    else:
        df["Subject"] = df["Subject # (this study)"].astype(int)

    # Rename columns to match DREAM challenge
    df = df.rename(columns={"Odor dilution": "Dilution"})
    df = df.rename(columns=dict(zip(descriptors_raw, descriptors)))

    # Fix CIDs for molecules that only have CAS registry numbers.
    # Geranylacetone didn't have a CID listed in the raw data
    # Isobutyl acetate had the wrong CAS number in the raw data
    # `regex=False` makes the CAS numbers match literally regardless of the
    # pandas version's default for `regex`.
    df["CID"] = (
        df["CID"]
        .astype(str)
        .str.replace("3796-70-1", "1549778", regex=False)
        .str.replace("109-19-0", "8038", regex=False)
        .astype(int)
    )

    # Possibly keep only the 476 DREAM challenge molecules
    if only_dream_molecules:
        dream_CIDs = dream.get_cids()
        # Sanity check on the reference list before filtering with it.
        assert len(dream_CIDs) == 476
        df = df[df["CID"].isin(dream_CIDs)]

    # Keep only relevant columns
    df = df[["CID", "Dilution", "Subject"] + descriptors]

    # Fill NaN values with 0 in rows whose Intensity is greater than 0.
    # Rows with zero or missing Intensity keep their NaNs.
    df = df.apply(lambda x: x.fillna(0) if x["Intensity"] > 0 else x, axis=1)

    # Convert dilution values with `dream.dilution_to_magnitude` and store
    # the result as floats.
    df["Dilution"] = df["Dilution"].apply(dream.dilution_to_magnitude).astype(float)

    # Set index and set column axis name
    df = df.set_index(["CID", "Dilution", "Subject"])
    df.columns.name = "Descriptor"

    # Identify replicates and add this information to the index.
    # The first occurrence of a (CID, Dilution, Subject) key gets 0 and any
    # later occurrence gets 1.
    df["Replicate"] = df.index.duplicated().astype(int)
    df = df.reset_index().set_index(["CID", "Dilution", "Replicate", "Subject"])
    if only_dream_subjects:
        # DREAM subjects replicates should be properly indexed now
        assert df.index.duplicated().sum() == 0

    # Rearrange dataframe to pivot subjects and descriptors:
    # subjects move to the columns, descriptors move into the index.
    df = df.unstack("Subject").stack("Descriptor")
    df = df.reorder_levels(["Descriptor", "CID", "Dilution", "Replicate"])
    df = df.sort_index()
    return df