April 2018
Beginner
238 pages
7h 13m
English
We can use the following script:
import pyspark
import csv
import operator
import itertools
import collections
import io

if not 'sc' in globals():
    sc = pyspark.SparkContext()

years = {}
occupations = {}
guests = {}

#The file header contains these column descriptors
#YEAR,GoogleKnowlege_Occupation,Show,Group,Raw_Guest_List

with open('daily_show_guests.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=',', quotechar='|')
    try:
        for row in reader:
            #track how many shows occurred in the year
            year = row['YEAR']
            if year in years:
                years[year] = years[year] + 1
            else:
                years[year] = 1

            # what guest occupations were prevalent
            occupation = row['GoogleKnowlege_Occupation']
            if occupation in occupations:
                occupations[occupation] ...