
分类变量:自动化时代的数据计数 | 73
# 加载前10 000条点评
>>> f = open('yelp_academic_dataset_review.json')
>>> js = []
>>> for i in range(10000):
... js.append(json.loads(f.readline()))
>>> f.close()
>>> review_df = pd.DataFrame(js)
# 定义m为唯一的business_id的数量
>>> m = len(review_df.business_id.unique())
>>> m
528
>>> from sklearn.feature_extraction import FeatureHasher
>>> h = FeatureHasher(n_features=m, input_type='string')
>>> f = h.transform(review_df['business_id'])
# Effect of hashing on feature interpretability:
# first, list the first five distinct raw business_id values
>>> review_df['business_id'].unique().tolist()[0:5]
['vcNAWiLM4dR7D2nwwJ7nCA',
'UsFtqoBl7naz8AVUBZMjQQ',
'cE27W9VPgO88Qxe4ol6y_g',
'HZdLhv6COCleJMo7nPl-RA',
'mVHrayjG3uZ_RLHkLj-AMg'] ...