In [1]:
import numpy          # linear algebra
import urllib         # load data from the web
import scipy.optimize # optimization routines
import random         # random number generation
import ast
In [2]:
def parseData(fname):
  for l in urllib.urlopen(fname):
    yield eval(l)
In [3]:
def parseDataFromFile(fname):
  for l in open(fname):
    #yield eval(l)
    yield ast.literal_eval(l)
In [4]:
#data = list(parseData("http://jmcauley.ucsd.edu/cse258/data/beer/beer_50000.json"))
data = list(parseDataFromFile("C:/Users/Julian McAuley/Documents/class_files/beer_50000.json"))
In [5]:
data[4]
Out[5]:
{'review/appearance': 4.0,
 'beer/style': 'American Double / Imperial IPA',
 'review/palate': 4.0,
 'review/taste': 4.5,
 'beer/name': 'Cauldron DIPA',
 'review/timeUnix': 1293735206,
 'user/gender': 'Male',
 'user/birthdayRaw': 'Jun 16, 1901',
 'beer/ABV': 7.7,
 'beer/beerId': '64883',
 'user/birthdayUnix': -2163081600,
 'beer/brewerId': '1075',
 'review/timeStruct': {'isdst': 0,
  'mday': 30,
  'hour': 18,
  'min': 53,
  'sec': 26,
  'mon': 12,
  'year': 2010,
  'yday': 364,
  'wday': 3},
 'user/ageInSeconds': 3581417047,
 'review/overall': 4.0,
 'review/text': "According to the website, the style for the Caldera Cauldron changes every year. The current release is a DIPA, which frankly is the only cauldron I'm familiar with (it was an IPA/DIPA the last time I ordered a cauldron at the horsebrass several years back). In any event... at the Horse Brass yesterday.\t\tThe beer pours an orange copper color with good head retention and lacing. The nose is all hoppy IPA goodness, showcasing a huge aroma of dry citrus, pine and sandlewood. The flavor profile replicates the nose pretty closely in this West Coast all the way DIPA. This DIPA is not for the faint of heart and is a bit much even for a hophead like myslf. The finish is quite dry and hoppy, and there's barely enough sweet malt to balance and hold up the avalanche of hoppy bitterness in this beer. Mouthfeel is actually fairly light, with a long, persistentely bitter finish. Drinkability is good, with the alcohol barely noticeable in this well crafted beer. Still, this beer is so hugely hoppy/bitter, it's really hard for me to imagine ordering more than a single glass. Regardless, this is a very impressive beer from the folks at Caldera.",
 'user/profileName': 'johnmichaelsen',
 'review/aroma': 4.5}
In [6]:
data = [d for d in data if 'user/ageInSeconds' in d and d['user/ageInSeconds'] < 80*365*24*60*60]
In [7]:
len(data)
Out[7]:
10389
In [8]:
data[0] 
Out[8]:
{'review/appearance': 4.0,
 'beer/style': 'American Pale Lager',
 'review/palate': 4.0,
 'review/taste': 4.0,
 'beer/name': 'Caldera OBF 15',
 'review/timeUnix': 1062311123,
 'user/gender': 'Male',
 'user/birthdayRaw': 'Jun 23, 1958',
 'beer/ABV': 5.6,
 'beer/beerId': '12386',
 'user/birthdayUnix': -363718800,
 'beer/brewerId': '1075',
 'review/timeStruct': {'isdst': 0,
  'mday': 31,
  'hour': 6,
  'min': 25,
  'sec': 23,
  'mon': 8,
  'year': 2003,
  'yday': 243,
  'wday': 6},
 'user/ageInSeconds': 1782054247,
 'review/overall': 4.0,
 'review/text': "More of a 'dry' than a lager, tasted at the 2002 Oregon Brewers Festival. Orange color, orange flavor in nose. Light malts and fairly aggressively hopped; yet it is not very bitter. Interesting taste, complex and subtle. Light yet flavorful. Mouthfeel is full and round. Finish is clean and smooth. Aftertaste is slightly bitter. Nice beer. Would be a great beer to sip during a hot summer day.",
 'user/profileName': 'beerguy101',
 'review/aroma': 3.0}
In [9]:
len(data)
Out[9]:
10389
In [10]:
def feature(datum):
  feat = datum['beer/ABV']
  return [1] + [feat]
In [11]:
X = [feature(d) for d in data]
In [12]:
X[:10]
Out[12]:
[[1, 5.6],
 [1, 7.4],
 [1, 7.4],
 [1, 7.4],
 [1, 7.4],
 [1, 7.4],
 [1, 7.4],
 [1, 5.5],
 [1, 5.5],
 [1, 5.5]]
In [13]:
y = [d['review/overall'] for d in data]
In [14]:
y[:10]
Out[14]:
[4.0, 4.0, 4.5, 4.5, 4.5, 4.0, 4.0, 2.5, 4.5, 4.5]
In [15]:
theta,residuals,rank,s = numpy.linalg.lstsq(X, y)
C:\Users\Julian McAuley\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: `rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.
To use the future default and silence this warning we advise to pass `rcond=None`, to keep using the old, explicitly pass `rcond=-1`.
  """Entry point for launching an IPython kernel.
In [16]:
theta
Out[16]:
array([3.4324085 , 0.06434933])
In [17]:
theta[1]*60*60*24*365
Out[17]:
2029320.3837231065
In [18]:
# Using linear algebra for comparison
In [19]:
X = numpy.matrix(X)
y = numpy.matrix(y)
numpy.linalg.inv(X.T * X) * X.T * y.T
Out[19]:
matrix([[3.4324085 ],
        [0.06434933]])
In [ ]: