Machine Learning without Pandas

In [1]:
# import modules
from pydblite.sqlite import Database, Table
import matplotlib.pylab as plt
from matplotlib import rcParams
import urllib3
import numpy as np
In [14]:
def loadremote_mplrc(url = ''):
    """Download a matplotlibrc-style file and apply its settings to ``rcParams``.

    The response body is read line by line. Empty lines, lines starting with
    ``#`` and lines starting with a space are ignored. Every other line is
    split at its FIRST colon into ``key: value``; anything after a ``#`` in the
    value is treated as an inline comment and dropped.

    Parameters
    ----------
    url : str
        Address of the remote rc file, fetched with an HTTP GET.

    Raises
    ------
    urllib3.exceptions.HTTPError
        If the server answers with a 4xx/5xx status, so that an error page is
        never parsed as if it were a settings file.

    Any exception raised by ``rcParams.update`` for the parsed settings
    propagates unchanged.
    """
    http = urllib3.PoolManager()
    response = http.request('GET', url)
    if response.status >= 400:
        raise urllib3.exceptions.HTTPError(
            "GET %s failed with HTTP status %d" % (url, response.status))

    settings = {}
    for raw_line in response.data.splitlines():
        line = raw_line.decode('utf-8')
        # Skip blank lines, full-line comments and indented lines.
        if not line or line.startswith("#") or line.startswith(" "):
            continue
        # Split on the first colon only: values (e.g. "%d %H:%M") and inline
        # comments may themselves contain colons and must not be discarded.
        key, separator, value = line.partition(":")
        if not separator:
            continue
        # Remove inline comments.
        settings[key.strip()] = value.split("#", 1)[0].strip()

    rcParams.update(settings)
In [15]:
# Custom .matplotlibrc
# Fetches the rc file over HTTP and applies it to rcParams, so this cell needs
# network access every time it runs.
# NOTE(review): the address is a URL-shortener link, so the file that actually
# gets applied cannot be identified from here — consider pinning the canonical
# URL (or a local copy) for reproducibility.
matplotlibrc_url = 'https://tinyurl.com/y6g5r5lk'
loadremote_mplrc(url = matplotlibrc_url)
In [4]:
# Open the database file (the path is relative to the notebook's working
# directory) and bind a handle to its "dataset" table.
mldb = Database("../../Data/mlpack.db")
dataset = Table("dataset",mldb)

1. Inspect the dataset

In [5]:
# Column schema of the table: column name -> type / NOT NULL flag / default.
dataset.field_info
Out[5]:
{'a': {'type': 'FLOAT', 'NOT NULL': True, 'DEFAULT': None},
 'b': {'type': 'FLOAT', 'NOT NULL': True, 'DEFAULT': None},
 'c': {'type': 'FLOAT', 'NOT NULL': True, 'DEFAULT': None}}

2. Preview the dataset

In [6]:
# Materialise the table's rows and show the first five records.
[record for record in dataset][:5]
Out[6]:
[{'__id__': 1, 'a': 1.1473823886461334, 'b': 0.02648743211679966, 'c': 4.0},
 {'__id__': 2, 'a': 0.10089323542829873, 'b': 0.882746190832952, 'c': 3.0},
 {'__id__': 3, 'a': -0.041286564091623595, 'b': 0.4004518246793048, 'c': 0.0},
 {'__id__': 4, 'a': 0.6577404364762345, 'b': -0.3854316399997564, 'c': 4.0},
 {'__id__': 5, 'a': -0.05394204585235289, 'b': 0.2685674383264389, 'c': 2.0}]

3. Numerical distribution

In [7]:
def features_for_group(label):
    """Return the 'a' and 'b' columns, as two lists, of the rows whose 'c' equals ``label``.

    The table is queried once per group and both columns are read from the
    same result, instead of repeating the query for every column.
    """
    rows = list(dataset(c=label))
    return [r['a'] for r in rows], [r['b'] for r in rows]

x1_4, x2_4 = features_for_group(4.0)
x1_2, x2_2 = features_for_group(2.0)
In [8]:
# Sanity check: the extracted feature columns are plain Python lists.
type(x1_2)
Out[8]:
list
In [9]:
# Overlaid histograms of both features for groups 4 and 2.
figure, ax = plt.subplots()
bins=15
histogram_series = (
    (x1_4, "Feature 1 group 4", 0.7),
    (x2_4, "Feature 2 group 4", 0.7),
    (x1_2, "Feature 1 group 2", 0.5),
    (x2_2, "Feature 2 group 2", 0.5),
)
for values, series_label, opacity in histogram_series:
    ax.hist(values, bins=bins, edgecolor='black', label=series_label, alpha=opacity)
ax.set_xlabel("Feature value")
ax.set_ylabel("Count")
# tight_layout() is a one-off adjustment of what already exists in the figure,
# so it has to run after the axes, labels and data have been created.
figure.tight_layout()
ax.legend()
Out[9]:
<matplotlib.legend.Legend at 0x235e8503550>

4. Features Scatterplot

In [10]:
# Feature 1 against feature 2, one marker style per group.
figure, ax = plt.subplots()
ax.scatter(x1_2, x2_2, marker="o", label="Group 2")
ax.scatter(x1_4, x2_4, marker="x", label="Group 4")
ax.set_xlabel("X1")
ax.set_ylabel("X2")
# tight_layout() is a one-off adjustment of what already exists in the figure,
# so it has to run after the axes, labels and data have been created.
figure.tight_layout()
ax.legend()
Out[10]:
<matplotlib.legend.Legend at 0x235eac172b0>

5. Correlation Matrix

In [11]:
# Pearson correlation matrix of the two features within group 2 only.
np.corrcoef(x1_2,x2_2)
Out[11]:
array([[ 1.        , -0.20530199],
       [-0.20530199,  1.        ]])
In [ ]: