资源描述:
《大数据预处理代码》由会员上传分享,免费在线阅读,更多相关内容在行业资料-天天文库。
# ---------------------------------------------------------------------------
# Example 1: dropping the first PCA component with a FunctionTransformer
# ---------------------------------------------------------------------------
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer


def _generate_vector(shift=0.5, noise=15):
    """Return a noisy ramp of 1000 values.

    The result is ``np.arange(1000)`` plus, per element,
    ``(u - shift) * noise`` where ``u`` is drawn with ``np.random.rand``.

    Parameters
    ----------
    shift : float
        Value subtracted from each random draw before scaling.
    noise : float
        Multiplier applied to the shifted random draw.
    """
    return np.arange(1000) + (np.random.rand(1000) - shift) * noise


def generate_dataset():
    """Build the two-line toy dataset.

    This dataset is two lines with a slope ~ 1, where one has a y offset
    of ~100.

    Returns
    -------
    tuple
        ``(X, y)``. ``X`` stacks two blocks of 1000 two-column rows: the
        first block pairs a noisy ramp with a second noisy ramp shifted by
        100, the second block pairs two unshifted noisy ramps. ``y`` is 1000
        zeros followed by 1000 ones, labelling the two blocks.
    """
    return np.vstack((
        np.vstack((
            _generate_vector(),
            _generate_vector() + 100,
        )).T,
        np.vstack((
            _generate_vector(),
            _generate_vector(),
        )).T,
    )), np.hstack((np.zeros(1000), np.ones(1000)))


def all_but_first_column(X):
    """Return ``X`` without its first column (``X[:, 1:]``)."""
    return X[:, 1:]


def drop_first_component(X, y):
    """Create a pipeline with PCA and the column selector and use it to
    transform the dataset.

    The data is split with ``train_test_split`` (default arguments), the
    pipeline is fitted on the training part, and the transformed test part
    is returned together with the matching test labels.
    """
    pipeline = make_pipeline(
        PCA(), FunctionTransformer(all_but_first_column),
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    pipeline.fit(X_train, y_train)
    return pipeline.transform(X_test), y_test


if __name__ == '__main__':
    X, y = generate_dataset()
    lw = 0  # marker edge line width passed to both scatter plots
    plt.figure()
    plt.scatter(X[:, 0], X[:, 1], c=y, lw=lw)
    plt.figure()
    X_transformed, y_transformed = drop_first_component(*generate_dataset())
    # Only one column is left after the selector, so plot it against zeros.
    plt.scatter(
        X_transformed[:, 0],
        np.zeros(len(X_transformed)),
        c=y_transformed,
        lw=lw,
        s=60,
    )
    plt.show()


# ---------------------------------------------------------------------------
# Example 2: StandardScaler vs. RobustScaler on data with an outlier
# ---------------------------------------------------------------------------
# The original listing opens with
#     from __future__ import print_function
# A __future__ import is only legal as the first statement of a file, so it
# cannot stay live here after Example 1. It is a no-op on Python 3, and the
# single-argument print() call below also works without it on Python 2.
print(__doc__)

# Code source: Thomas Unterthiner
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler

# Create training and test data
np.random.seed(42)
n_datapoints = 100
Cov = [[0.9, 0.0], [0.0, 20.0]]
mu1 = [100.0, -3.0]
mu2 = [101.0, -3.0]
X1 = np.random.multivariate_normal(mean=mu1, cov=Cov, size=n_datapoints)
X2 = np.random.multivariate_normal(mean=mu2, cov=Cov, size=n_datapoints)
Y_train = np.hstack([[-1] * n_datapoints, [1] * n_datapoints])
X_train = np.vstack([X1, X2])

X1 = np.random.multivariate_normal(mean=mu1, cov=Cov, size=n_datapoints)
X2 = np.random.multivariate_normal(mean=mu2, cov=Cov, size=n_datapoints)
Y_test = np.hstack([[-1] * n_datapoints, [1] * n_datapoints])
X_test = np.vstack([X1, X2])

X_train[0, 0] = -1000  # a fairly large outlier

# Scale data
# NOTE(review): the visible text breaks off mid-statement at this point
# ("standard_"); the rest of this listing lies outside the visible range and
# has deliberately not been reconstructed.