"""Fit a linear regression from per-movie actor ratings to the movie's rating.

Reads ``processedData/actorsRatingsPerMovieGoodToUse.tsv``, builds one feature
row per movie from the first four entries of its ``ratings`` value, uses the
``averageRatingMovie`` column as the target, and fits a scikit-learn
``LinearRegression`` on the result.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# One-off preprocessing that produced the "GoodToUse" file read below: keep
# only the rows whose ``ratings`` list has at least four entries.
# Columns of the source file: tconst ratings actorNames averageRatingMovie
# dfRatingsTropGrand = pd.read_csv("processedData/actorsRatingsPerMovie.tsv",sep='\t')
# dfRatings = dfRatingsTropGrand[dfRatingsTropGrand['ratings'].apply(lambda x: len(eval(x)) >= 4)]
# dfRatings.to_csv("processedData/actorsRatingsPerMovieGoodToUse.tsv", index=False, sep="\t")
dfRatings = pd.read_csv("processedData/actorsRatingsPerMovieGoodToUse.tsv", sep="\t")

# Alternative that restricts the run to the first 1000 rows:
# listMovies = dfRatings.head(1000)['tconst'].values
listMovies = dfRatings['tconst'].values
listRatingsA = []  # one feature row per movie: first four entries of ``ratings``
listRatingsM = []  # regression target per movie: ``averageRatingMovie``
datas = []  # not used in the visible part of the script; kept for later code
nbDiese = 0  # not used in the visible part of the script; kept for later code

# Index the rows by tconst in a single pass instead of re-scanning the whole
# frame with a boolean mask for every movie (which was quadratic in the number
# of rows).  ``setdefault`` keeps the FIRST row seen for a tconst, matching the
# ``.values[0]`` pick of the previous mask-based lookup.
_first_row_by_tconst = {}
for _tconst, _average, _raw_ratings in zip(
    listMovies,
    dfRatings['averageRatingMovie'].values,
    dfRatings['ratings'].values,
):
    _first_row_by_tconst.setdefault(_tconst, (_average, _raw_ratings))

_nb_movies = len(listMovies)
for i, film in enumerate(listMovies):
    # Progress indicator, rewritten in place on the same terminal line.
    print(i / _nb_movies * 100, "%", end="\r")
    _average, _raw_ratings = _first_row_by_tconst[film]
    # NOTE(review): ``eval`` executes whatever text is stored in the file.
    # Acceptable only while the TSV is locally generated and trusted; if the
    # column always holds a plain list literal, ``ast.literal_eval`` would be
    # the safe replacement -- confirm the stored format before switching.
    ratings = eval(_raw_ratings)
    listRatingsA.append(ratings[:4])
    listRatingsM.append(_average)
print("")  # move past the "\r"-terminated progress line

x = listRatingsA
y = listRatingsM
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3)
xtrain = np.array(xtrain)

lnrg = LinearRegression()
# The model is currently fitted on the full data set; the 70/30 split above is
# only consumed by the commented-out alternative below.
# clf = lnrg.fit(xtrain,ytrain)
clf = lnrg.fit(x, y)