Libraries that are commonly used in Python are:
import pymongo
import io,json
import csv
import time
from pymongo import MongoClient
import dateutil.tz
import pandas as pd
import numpy as np
from bson import Binary, Code
from bson.json_util import dumps
import re
from bson.objectid import ObjectId
import string
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
from datetime import date
import sys
import unicodedata
from bson.objectid import ObjectId
import statistics as st
from functools import partial
import time
import datetime
from datetime import datetime as dt
from datetime import date
import matplotlib.pyplot as plt
import xlsxwriter
import matplotlib.pyplot as plt
from pylab import *
The next part is the utility functions.
You can either build them into a separate library or include them at the start of the code.
# Convert dates into fractions
def toYearFraction(date):
    """Convert a date into a fractional year, e.g. 2020-07-02 -> 2020.5.

    The fraction is the share of the calendar year that has elapsed at
    ``date``. It is computed with naive calendar arithmetic, so the result
    does not depend on the machine's time zone or daylight-saving rules.

    Args:
        date: Any object exposing ``year``, ``month`` and ``day`` attributes
            (``datetime.date``, ``datetime.datetime``, ...). ``hour``,
            ``minute`` and ``second`` are used when present; microseconds
            and time-zone information are ignored.

    Returns:
        The year plus the elapsed fraction as a float, or None when ``date``
        cannot be interpreted as a date.
    """
    try:
        year = date.year
        # Rebuild a naive datetime from the wall-clock fields so that plain
        # dates and datetimes are handled the same way.
        moment = dt(
            year, date.month, date.day,
            getattr(date, 'hour', 0),
            getattr(date, 'minute', 0),
            getattr(date, 'second', 0),
        )
        startOfThisYear = dt(year=year, month=1, day=1)
        startOfNextYear = dt(year=year + 1, month=1, day=1)
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Not a date-like value (None, NaT, strings, out-of-range years...).
        return None
    yearElapsed = (moment - startOfThisYear).total_seconds()
    yearDuration = (startOfNextYear - startOfThisYear).total_seconds()
    return year + yearElapsed / yearDuration
Once the utility functions are defined, the main code starts, which is mostly about manipulating the data.
Top 10 pandas plays:
# `pd` is the pandas module in this file, so the column has to be taken from
# a DataFrame (called `df` here). Summing the boolean result counts the rows
# whose value contains the search text.
df['column1'].str.contains('searchingtext').sum()
Searches for a string in a column and counts the number of rows in which it is found.
pd.to_datetime(Jobs['RawDate'], errors='coerce')
This command converts the column to datetimes; with errors='coerce', values that cannot be parsed become NaT instead of raising an error, so they do not cause trouble later.
# `pd` is the pandas module in this file and cannot be indexed; the column
# has to be taken from a DataFrame (called `df` here).
df['column1'].apply(function)
This applies the function to each value in the column.
.sub(value1)
to subtract this value from everything
.apply(abs)
take absolute values of each entry
Subtract the values of one column from the respective values of another column:
(df['col1']).sub(df['col2'])
Filtering with greater-than / less-than comparisons, renaming columns, and using groupby:
# Keep the rows where col1 < 1 and col2 > 0. Each comparison needs its own
# parentheses because `&` binds more tightly than `<` and `>`.
df = df[(df['col1'] < 1) & (df['col2'] > 0)]
# Rename by assignment instead of inplace=True, so the filtered result is
# never modified through a view of the original frame.
df = df.rename(columns={'col1': 'col1new'})
# Count the non-null col1new values per PropertyId; the grouping column has
# to be one of the selected columns.
df = df[['PropertyId', 'col1new']].groupby(['PropertyId']).count()
Aggregating each group with several statistics at once (mean, count, sum, etc.):
# Per-id statistics of col1: mean, count and sum in a single pass.
df = (
    df[['id', 'col1']]
    .groupby(['id'])
    .aggregate(['mean', 'count', 'sum'])
)
# Cross-check the sums and averages against the Excel file when running this.
df.rename(
    columns={'mean': 'Avg', 'count': 'Countelements', 'sum': 'Totalforme'},
    inplace=True
)
# DataFrames to join onto Properties, separated by commas
# (df1, df2, df3 are placeholders for the real frames).
List = [df1, df2, df3]
# Start from Properties and feed every result into the next merge, so that
# all joins accumulate instead of only the last one surviving.
Finalpd = Properties
for frame in List:
    # The frames are joined on their index; passing right_on together with
    # right_index=True is a conflicting key specification, so only the index
    # join is kept.
    # NOTE(review): assumes each frame is indexed by the Id that matches
    # Properties['_id'] (e.g. the result of a groupby on that Id) - confirm.
    Finalpd = pd.merge(Finalpd, frame, how='left', left_on='_id', right_index=True)
This is how several joins can be performed in a loop.
def connect_to_MongoDB(env):
    """Return the 'IP' entry stored for ``env`` in ``ip_config``.

    Despite its name, this helper opens no connection: it only reads a value
    from the global ``ip_config`` object, which must be defined before the
    call.

    Args:
        env: Row label to look up in ``ip_config``.
            NOTE(review): presumably an environment name - confirm.

    Returns:
        The value at row ``env``, column ``'IP'`` of ``ip_config``.
    """
    return ip_config.loc[env, 'IP']
def get_value1(key, entity):
    """Look up ``key`` in the first element of ``entity``.

    Args:
        key: Key passed to the first element's ``get`` method.
        entity: Indexable container whose element 0 provides ``get``
            (for example a list of dicts).

    Returns:
        ``entity[0].get(key)``, or None when ``entity`` is empty, is not
        indexable, or its first element has no usable ``get``.
    """
    try:
        return entity[0].get(key)
    except (AttributeError, IndexError, KeyError, TypeError):
        # Best-effort lookup: missing or malformed data maps to None.
        return None
def get_value(key, entity):
    """Look up ``key`` in ``entity`` through its ``get`` method.

    Args:
        key: Key passed to ``entity.get``.
        entity: Object providing ``get`` (for example a dict).

    Returns:
        ``entity.get(key)``, or None when ``entity`` has no ``get`` method
        or ``key`` cannot be used for the lookup.
    """
    try:
        return entity.get(key)
    except (AttributeError, TypeError):
        # Best-effort lookup: non-mapping values (None, numbers, ...) and
        # unhashable keys map to None.
        return None
def isnumber(num):
    """Tell whether ``num`` is a string made up only of digits.

    Args:
        num: Value to test; expected to provide ``isdigit`` (a string).

    Returns:
        The result of ``num.isdigit()``, or None when ``num`` has no
        ``isdigit`` method (for example None or a number).
    """
    try:
        return num.isdigit()
    except AttributeError:
        # Non-string input: keep the historical "unknown" answer.
        return None
Plotting your first graph
# Histogram of temparray with 200 bins.
plt.hist(temparray, bins=200)
plt.title("Histogram for temparrays")
plt.xlabel("Size ")
plt.ylabel("Frequency")
# Save through plt like every other call here, instead of relying on the
# names injected by `from pylab import *`.
plt.savefig('filename.png')
# Clear the current figure so that a later plot starts from an empty canvas.
plt.clf()