Peer-graded Assignment: Week 3 Part 1 For Capstone Project
# 1. Setting up the environment
# In [1]:
import numpy as np
import pandas as pd

# Show every column and row when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
# NOTE(review): newer pandas releases expose this as pd.json_normalize;
# confirm this import path against the installed pandas version.
from pandas.io.json import json_normalize

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Libraries imported.')
# Output: Libraries imported.

# 2. Parsing the html
# In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Use a context manager so the HTTP response is closed as soon as the
# body has been read, instead of being left open until garbage collection.
with urlopen(url) as response:
    page = response.read().decode('utf-8')
soup = BeautifulSoup(page, 'html.parser')
wiki_table = soup.body.table.tbody

# 3. Extracting data from the table to the data frame
# In [4]:
def get_cell(element):
    """Return the text of every ``<td>`` cell found under *element*.

    For each cell, the text of its first ``<a>`` link is used when the cell
    has a link with non-empty text. Otherwise the cell's own string,
    stripped of surrounding whitespace, is used.

    Args:
        element: A parsed HTML node exposing ``find_all`` (here, a table row).

    Returns:
        A list of strings, one per ``<td>`` cell, in document order.
    """
    row = []
    for cell in element.find_all('td'):
        link = cell.a
        if link and link.text:
            row.append(link.text)
            continue
        text = cell.string
        if text is None:
            # ``.string`` is None when the cell holds several child nodes
            # (or an empty link); fall back to the concatenated text rather
            # than failing on ``None.strip()``.
            row.append(cell.get_text(strip=True))
        else:
            row.append(text.strip())
    return row
In [5]:
def get_row():
    """Collect the three-cell rows of the module-level ``wiki_table``.

    Every ``<tr>`` in ``wiki_table`` is passed to ``get_cell``; results that
    do not contain exactly three cells are discarded.

    Returns:
        A list of three-item lists, in table order.
    """
    parsed_rows = (get_cell(tr) for tr in wiki_table.find_all('tr'))
    return [cells for cells in parsed_rows if len(cells) == 3]
# In [6]:
data = get_row()
columns = ['Postcode', 'Borough', 'Neighbourhood']
df = pd.DataFrame(data, columns=columns)
df.head()
# Out[6]:
#   Postcode           Borough     Neighbourhood
# 0      M1A      Not assigned      Not assigned
# 1      M2A      Not assigned      Not assigned
# 2      M3A        North York         Parkwoods
# 3      M4A        North York  Victoria Village
# 4      M5A  Downtown Toronto      Harbourfront

# In [7]:
df.shape
# Out[7]: (289, 3)

# 4. Cleaning the data
# In [8]:
# Keep only rows with an assigned borough, order them by postcode and then
# borough, and renumber the index from zero.
df1 = df[df.Borough != 'Not assigned']
df1 = df1.sort_values(by=['Postcode', 'Borough']).reset_index(drop=True)
df1.head()
# Out[8]:
#   Postcode      Borough   Neighbourhood
# 0      M1B  Scarborough           Rouge
# 1      M1B  Scarborough         Malvern
# 2      M1C  Scarborough  Highland Creek
# 3      M1C  Scarborough      Rouge Hill
# 4      M1C  Scarborough      Port Union

# In [9]:
# Collapse df1 to one row per postcode:
#   * sort=False keeps postcodes in their order of first appearance in df1;
#   * Borough takes the value of the last row of each postcode group;
#   * Neighbourhood is the comma-joined list of the group's neighbourhoods,
#     in df1 row order.
df2 = (
    df1.groupby('Postcode', as_index=False, sort=False)
    .agg({'Borough': 'last', 'Neighbourhood': ','.join})
)

# In [ ]:
df2.shape