Python and Mongol: December 2015

Libraries that are generally used in python are:

import pymongo
import io,json
import csv
import time
from pymongo import MongoClient
import dateutil.tz
import pandas as pd
import numpy as np
from bson import Binary, Code
from bson.json_util import dumps
import re
from bson.objectid import ObjectId
import string
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
from datetime import date
import sys
import unicodedata
from bson.objectid import ObjectId
import statistics as st
from functools import partial
import time
import datetime
from datetime import datetime as dt
from datetime import date
import matplotlib.pyplot as plt
import xlsxwriter
import matplotlib.pyplot as plt
from pylab import *

The next part is the utility functions.

You can either build them into a separate library or include them at the start of your code.

# Convert dates into fractions

def toYearFraction(date):
    """Convert a date to a fractional year, e.g. 1 January 2015 -> 2015.0.

    The fraction is the time elapsed since 1 January of the date's year
    divided by the length of that year, both measured in seconds with
    ``time.mktime``.

    Args:
        date: object exposing a ``year`` attribute and a ``timetuple()``
            method, such as ``datetime.date`` or ``datetime.datetime``.

    Returns:
        ``date.year`` plus the elapsed fraction of that year, or ``None``
        when the value cannot be converted.
    """
    def seconds_since_epoch(moment):
        return time.mktime(moment.timetuple())

    try:
        year = date.year
        start_of_year = seconds_since_epoch(dt(year=year, month=1, day=1))
        start_of_next_year = seconds_since_epoch(
            dt(year=year + 1, month=1, day=1))

        elapsed = seconds_since_epoch(date) - start_of_year
        return year + elapsed / (start_of_next_year - start_of_year)
    except Exception:
        # Deliberately best-effort: anything that is not a usable date
        # yields None instead of aborting the caller.
        return None

After the utility functions comes the main code, which is mostly about playing with the data.

Top 10 pandas plays.

pd.column1.str.contains('searchingtext').sum()
Searches a column for a string and counts the rows that contain it (here `pd` stands for your DataFrame, not the pandas module).

pd.to_datetime(Jobs['RawDate'], errors='coerce')
This converts the column to datetimes; any value that cannot be parsed becomes NaT instead of raising an error, so it does not cause trouble later.

pd['column1'].apply(function)
this will apply function to each value in the column

.sub(value1)
to subtract this value from everything

.apply(abs)
take absolute values of each entry

Subtracting the values of one column from the corresponding values of another column:

(df['col1']).sub(df['col2'])

Using greater than or smaller than, renaming columns and using groupby
# Keep rows where col1 < 1 and col2 > 0. Each comparison needs its own
# parentheses because & binds more tightly than < and >.
df = df[(df['col1'] < 1) & (df['col2'] > 0)]
df.rename(columns={'col1': 'col1new'}, inplace=True)
# Count the col1new entries per PropertyId. The grouping key must be one of
# the selected columns, and the column must be referred to by its new name.
df = df[['PropertyId', 'col1new']].groupby(['PropertyId']).count()

Aggregating everything and then summing - mean - etc

# Summarise col1 per id: its mean, its number of entries and its total.
# Worth cross-checking a few sums and averages against the Excel file.
per_id = df.loc[:, ['id', 'col1']].groupby(['id'])
df = per_id.aggregate(['mean', 'count', 'sum'])
new_names = {'mean': 'Avg', 'count': 'Countelements', 'sum': 'Totalforme'}
df.rename(columns=new_names, inplace=True)

# Left-join every DataFrame in List onto Properties, one after another.
List = []  # fill in the DataFrames to join; each needs an 'Id' column
Finalpd = Properties
for frame in List:
    # Merge onto the running result: merging onto Properties every time
    # would keep only the last join.
    # right_on='Id' names the right-hand key; right_index=True was dropped
    # because it asks for the right index as the key instead.
    Finalpd = pd.merge(Finalpd, frame, how='left', left_on='_id', right_on='Id')
    # The copied 'Id' key only repeats '_id'; drop it so the next frame's
    # 'Id' column does not clash with it.
    Finalpd = Finalpd.drop('Id', axis=1)

This is how we can perform several joins in a loop.

def connect_to_MongoDB(env):
    """Return the IP address configured for *env*.

    Looks up row ``env``, column ``'IP'`` in the module-level ``ip_config``
    table, which is defined outside this snippet. Despite its name, the
    function opens no connection; it only returns the address.
    """
    return ip_config.loc[env, 'IP']

def get_value1(key, entity):
    """Return ``entity[0].get(key)``, or ``None`` if that lookup fails.

    Args:
        key: key to look up in the first element of *entity*.
        entity: indexable container whose first element offers ``get``,
            for example a list of dicts.

    Returns:
        The value stored under *key*, or ``None`` when *entity* is empty,
        is not indexable, its first element has no ``get``, or the key is
        absent.
    """
    try:
        return entity[0].get(key)
    except Exception:
        # Deliberately best-effort: malformed records yield None.
        return None

def get_value(key, entity):
    """Return ``entity.get(key)``, or ``None`` if that lookup fails.

    Args:
        key: key to look up.
        entity: object offering a ``get`` method, for example a dict.

    Returns:
        The value stored under *key*, or ``None`` when the key is absent
        or *entity* has no usable ``get``.
    """
    try:
        return entity.get(key)
    except Exception:
        # Deliberately best-effort: malformed records yield None.
        return None

def isnumber(num):
    """Report whether *num* consists only of digit characters.

    Args:
        num: value to test; expected to offer ``isdigit`` (a string).

    Returns:
        The result of ``num.isdigit()``, or ``None`` when *num* has no
        usable ``isdigit`` method (for example an int or ``None``).
    """
    try:
        return num.isdigit()
    except Exception:
        # Deliberately best-effort: non-string input yields None.
        return None

Plotting your first graph

# Histogram of temparray in 200 bins, written to filename.png.
plt.hist(temparray, bins=200)
plt.title("Histogram for temparrays")
plt.xlabel("Size ")
plt.ylabel("Frequency")
# Call savefig through plt like the other calls rather than relying on the
# bare name leaked by `from pylab import *`.
plt.savefig('filename.png')
# Clear the current figure so a later plot starts empty.
plt.clf()

Python and Mongol

Monday, December 28, 2015

Data Manipulation in R using dlply library

Thursday, December 17, 2015

Installing and Connecting MongoDB

Tuesday, December 8, 2015

Introduction to Python for Panda