"""Hillstrom challenge dataset.
A dataset from Kevin Hillstrom's MineThatData blog. See
https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html
for details.
"""
import numpy as np
from .base import _fetch_remote_csv
from .base import RemoteFileMetadata
# Where the raw Hillstrom CSV is downloaded from, the local file name it is
# stored under, and the checksum recorded for the downloaded file.
# NOTE(review): the checksum is 64 hex characters, which looks like a SHA-256
# digest -- confirm against how RemoteFileMetadata is consumed.
ARCHIVE = RemoteFileMetadata(
    url=(
        'http://www.minethatdata.com/'
        'Kevin_Hillstrom_MineThatData_E-MailAnalytics'
        '_DataMiningChallenge_2008.03.20.csv'
    ),
    filename="Hillstrom.csv",
    checksum=(
        '0e5893329d8b93cefecc571777672028'
        '290ab69865718020c78c7284f291aece'
    ),
)
def fetch_Hillstrom(data_home=None, download_if_missing=True,
                    random_state=None, shuffle=False,
                    categ_as_strings=False, return_X_y=False,
                    as_frame=False):
    """Load the Hillstrom dataset (uplift classification and regression).

    Download it if necessary.

    Parameters
    ----------
    data_home : string, optional
        Intended to specify another download and cache folder for the
        datasets. NOTE(review): this argument is currently *not* forwarded
        to ``_fetch_remote_csv`` by this function, so passing it has no
        effect here; confirm whether the helper accepts it before wiring
        it through.

    download_if_missing : boolean, default=True
        If False, raise an IOError if the data is not locally available
        instead of trying to download the data from the source site.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for dataset shuffling. Pass an
        int for reproducible output across multiple function calls.

    shuffle : bool, default=False
        Whether to shuffle dataset.

    categ_as_strings : bool, default=False
        Whether to return categorical variables as strings.

    return_X_y : boolean, default=False
        If True, returns ``(data.data, data.target)`` instead of a Bunch
        object.

    as_frame : boolean, default=False
        If True features are returned as pandas DataFrame. If False
        features are returned as object or float array. Float array
        is returned if all features are floats.

    Returns
    -------
    dataset : dict-like object with the following attributes:

    dataset.data : numpy array
        Each row corresponds to the features in the dataset.
    dataset.target_visit : numpy array
        Each value is 1 if website visit occurred 0 otherwise.
    dataset.target_conversion : numpy array
        Each value is 1 if purchase occurred 0 otherwise.
    dataset.target_spend : numpy array
        Each value corresponds to the amount of money spent.
    dataset.descr : string
        Description of the Hillstrom dataset (this module's docstring).
        Only set when ``return_X_y`` is False.

    (data, target_visit, target_conversion, target_spend) : tuple if
        ``return_X_y`` is True
    """
    # Allowed values of the categorical columns, in the order given here.
    treatment_values = ['No E-Mail', 'Mens E-Mail', 'Womens E-Mail']
    history_segment_values = ['1) $0 - $100', '2) $100 - $200',
                              '3) $200 - $350', '4) $350 - $500',
                              '5) $500 - $750', '6) $750 - $1,000',
                              '7) $1,000 +']
    # NOTE(review): 'Surburban' is presumably spelled this way to match the
    # raw CSV values -- do not "correct" it without checking the data file.
    zip_code_values = ['Rural', 'Surburban', 'Urban']
    channel_values = ['Phone', 'Web', 'Multichannel']

    # Attribute descriptions handed to _fetch_remote_csv. The second element
    # of each tuple is either a numeric type or the list of category values.
    # NOTE(review): the third element of the treatment/target tuples looks
    # like the column name used in the source CSV -- confirm in
    # _fetch_remote_csv.
    treatment_descr = [("treatment", treatment_values, "segment")]
    target_descr = [("target_visit", np.int32, "visit"),
                    ("target_conversion", np.int32, "conversion"),
                    ("target_spend", float, "spend")]
    feature_descr = [("recency", np.int32),
                     ("history_segment", history_segment_values),
                     ("history", float),
                     ("mens", np.int32),
                     ("womens", np.int32),
                     ("zip_code", zip_code_values),
                     ("newbie", np.int32),
                     ("channel", channel_values)]

    # NOTE(review): ``data_home`` is accepted by this function but is not
    # passed on below; left unchanged because the helper's signature is not
    # visible from here.
    ret = _fetch_remote_csv(
        ARCHIVE, "Hillstrom",
        feature_attrs=feature_descr,
        treatment_attrs=treatment_descr,
        target_attrs=target_descr,
        categ_as_strings=categ_as_strings,
        return_X_y=return_X_y, as_frame=as_frame,
        download_if_missing=download_if_missing,
        random_state=random_state, shuffle=shuffle,
        # 8 features + 1 treatment + 3 targets described above.
        total_attrs=12,
    )

    # Attach the module docstring as the description only when the helper
    # result is returned as-is rather than in the ``return_X_y`` form.
    if not return_X_y:
        ret.descr = __doc__
    return ret