Representing Data in R – Python equivalent

import pandas as pd
import numpy as np
# R's 'character' type corresponds to Python's str.
firstName = 'jeff'
# Pass the type and the value as separate arguments so print() writes them
# space-separated rather than as the repr of a 2-tuple.
print(type(firstName), firstName)
<type 'str'> jeff
# R's 'numeric' type corresponds to Python's float.
heightCM = 188.2
# Separate arguments give "type value" output instead of a tuple repr.
print(type(heightCM), heightCM)
<type 'float'> 188.2
# R's 'integer' type corresponds to Python's int.
numberSons = 1
# Separate arguments give "type value" output instead of a tuple repr.
print(type(numberSons), numberSons)
<type 'int'> 1
# R's 'logical' type corresponds to Python's bool.
teachingCoursera = True
# Separate arguments give "type value" output instead of a tuple repr.
print(type(teachingCoursera), teachingCoursera)
<type 'bool'> True
# R 'vectors' correspond to one-dimensional numpy arrays (a plain Python list
# would also do; arrays are used everywhere here for consistency).
heights = np.array((188.2, 181.3, 193.4))
firstNames = np.array(('jeff', 'roger', 'andrew', 'brian'))

# One array per output line, the numeric vector first.
print(heights, firstNames, sep='\n')
[ 188.2  181.3  193.4]
['jeff' 'roger' 'andrew' 'brian']
# An R 'list' (named, heterogeneous container) corresponds to a Python dict.
vector1 = np.array((188.2, 181.3, 193.4))
vector2 = np.array(('jeff', 'roger', 'andrew', 'brian'))
myList = {'heights': vector1, 'firstNames': vector2}

# Show the whole dict first, then each entry looked up by its key.
print(myList)
print(myList['heights'], myList['firstNames'], sep='\n')
{'firstNames': array(['jeff', 'roger', 'andrew', 'brian'], 
      dtype='|S6'), 'heights': array([ 188.2,  181.3,  193.4])}
[ 188.2  181.3  193.4]
['jeff' 'roger' 'andrew' 'brian']
# R 'matrices' correspond to two-dimensional numpy arrays.
# reshape() fills row by row, so the rows are [1, 2] and [3, 4].
myMatrix = np.array([1, 2, 3, 4]).reshape(2, 2)
print(myMatrix)
[[1 2]
 [3 4]]
# An R data frame corresponds to a pandas DataFrame.
# This example fails on purpose: the input arrays have different lengths
# (three heights, four names), which the DataFrame constructor rejects.
vector1 = np.array([188.2, 181.3, 193.4])
vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])

# Catch the expected ValueError and report it, so the failure is demonstrated
# without aborting the statements that follow when this runs as a script.
try:
    myDataFrame = pd.DataFrame(dict(heights=vector1, firstNames=vector2))
except ValueError as err:
    print('ValueError:', err)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)

<ipython-input-10-58e1535d1fac> in <module>()
      6 # ValueError: arrays must all be same length
      7 #
----> 8 myDataFrame = pd.DataFrame(dict(heights = vector1, firstNames = vector2))


/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    383             mgr = self._init_mgr(data, index, columns, dtype=dtype, copy=copy)
    384         elif isinstance(data, dict):
--> 385             mgr = self._init_dict(data, index, columns, dtype=dtype)
    386         elif isinstance(data, ma.MaskedArray):
    387             mask = ma.getmaskarray(data)


/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _init_dict(self, data, index, columns, dtype)
    515 
    516         return _arrays_to_mgr(arrays, data_names, index, columns,
--> 517                               dtype=dtype)
    518 
    519     def _init_ndarray(self, values, index, columns, dtype=None,


/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
   5343     # figure out the index, if necessary
   5344     if index is None:
-> 5345         index = extract_index(arrays)
   5346     else:
   5347         index = _ensure_index(index)


/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in extract_index(data)
   5395             lengths = list(set(raw_lengths))
   5396             if len(lengths) > 1:
-> 5397                 raise ValueError('arrays must all be same length')
   5398 
   5399             if have_dicts:


ValueError: arrays must all be same length
# data frame -- fixed: both columns now hold four entries
vector1 = np.array((188.2, 181.3, 193.4, 192.3))
vector2 = np.array(('jeff', 'roger', 'andrew', 'brian'))

# Dict keys become the column names.
myDataFrame = pd.DataFrame({'heights': vector1, 'firstNames': vector2})
myDataFrame
firstNames heights
0 jeff 188.2
1 roger 181.3
2 andrew 193.4
3 brian 192.3
# R factors correspond to pandas Categorical.
smoker = np.array(['yes', 'no', 'yes', 'yes'])
# Build it with the Categorical constructor; the older
# Categorical.from_array classmethod is not available in current pandas.
smokerFactor = pd.Categorical(smoker)
smokerFactor
Categorical: 
array(['yes', 'no', 'yes', 'yes'], dtype=object)
Levels (2): Index(['no', 'yes'], dtype=object)
# R's NA missing value corresponds to NaN (for floating-point data).
# NaN and isnan are taken from the numpy namespace: only `np` is imported,
# so the bare names `NaN` / `isnan` would raise NameError.
vector1 = np.array([188.2, 181.3, 193.4, np.nan])
print(vector1)
# Element-wise test: True exactly where the entry is missing.
print(np.isnan(vector1))
[ 188.2  181.3  193.4    nan]
[False False False  True]
# subsetting
vector1 = np.array([188.2, 181.3, 193.4, 192.3])
vector2 = np.array(['jeff', 'roger', 'andrew', 'brian'])

myDataFrame = pd.DataFrame(dict(heights=vector1, firstNames=vector2))

print('------------------')
# single element by position
print(vector1[0])
print('------------------')
# several elements at once, selected with a list of positions
print(vector1[[0, 1, 3]])
print('------------------')
# First row, first two columns. .iloc is purely positional and replaces the
# .ix indexer, which is no longer available in current pandas.
print(myDataFrame.iloc[0, 0:2])  # appears transposed as compared to R
print('------------------')
# one column selected by name
print(myDataFrame['firstNames'])  # there's no 'Levels' as in R
print('------------------')
# rows selected with a boolean mask on a string column
print(myDataFrame[myDataFrame['firstNames'] == 'jeff'])
print('------------------')
# rows selected with a boolean mask on a numeric column
print(myDataFrame[myDataFrame['heights'] < 190])
------------------
188.2
------------------
[ 188.2  181.3  192.3]
------------------
firstNames     jeff
heights       188.2
Name: 0
------------------
0      jeff
1     roger
2    andrew
3     brian
Name: firstNames
------------------
  firstNames  heights
0       jeff    188.2
------------------
  firstNames  heights
0       jeff    188.2
1      roger    181.3