GitHub

Peer-graded Assignment: Week 3 Part 1 For Capstone Project

Setting up the environment In [1]: import numpy as np import pandas as pd pd.set_option('display.max_columns', None) pd.set_option('display.max_rows', None)

import json

#!conda install -c conda-forge geopy --yes from geopy.geocoders import Nominatim

from bs4 import BeautifulSoup from urllib.request import urlopen import requests from pandas.io.json import json_normalize

import matplotlib.cm as cm import matplotlib.colors as colors

from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes import folium

print('Libraries imported.')
# Output: Libraries imported.

# 2. Parsing the html
# In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Use the response as a context manager so the HTTP connection is closed
# as soon as the page has been read, instead of being left open.
with urlopen(url) as response:
    page = response.read().decode('utf-8')

soup = BeautifulSoup(page, 'html.parser')

wiki_table = soup.body.table.tbody

# 3. Extracting data from the table to the data frame
# In [4]:
def get_cell(element):
    """Return the text of every ``<td>`` cell found under *element*.

    A cell that contains a link with non-empty text contributes that link
    text unchanged; every other cell contributes its own text with the
    surrounding whitespace stripped.

    Args:
        element: A BeautifulSoup tag (here, a ``<tr>``) to search for cells.

    Returns:
        A list of strings, one per ``<td>`` found, in document order.
    """
    row = []

    for cell in element.find_all('td'):
        link = cell.a
        if link is not None and link.text:
            row.append(link.text)
            continue
        # ``cell.string`` is None when the cell has several children or only
        # an empty link, so ``cell.string.strip()`` would raise
        # AttributeError; ``get_text`` always returns a string.
        row.append(cell.get_text(strip=True))

    return row

# In [5]:
def get_row():
    """Collect the rows of ``wiki_table`` that hold exactly three cells.

    Every ``<tr>`` of the module-level ``wiki_table`` is passed through
    ``get_cell``; rows whose cell count is not three are left out.

    Returns:
        A list of three-item lists of cell text, in table order.
    """
    parsed_rows = (get_cell(tr) for tr in wiki_table.find_all('tr'))
    return [cells for cells in parsed_rows if len(cells) == 3]

# In [6]:
data = get_row()
columns = ['Postcode', 'Borough', 'Neighbourhood']
df = pd.DataFrame(data, columns=columns)
df.head()
# Out[6]:
#   Postcode           Borough     Neighbourhood
# 0      M1A      Not assigned      Not assigned
# 1      M2A      Not assigned      Not assigned
# 2      M3A        North York         Parkwoods
# 3      M4A        North York  Victoria Village
# 4      M5A  Downtown Toronto      Harbourfront

# In [7]:
df.shape
# Out[7]: (289, 3)

# 4. Cleaning the data
# In [8]:
# Keep only the rows with an assigned borough, order them by postcode and
# borough, and renumber the index from zero.
df1 = df.loc[df['Borough'] != 'Not assigned']
df1 = df1.sort_values(by=['Postcode', 'Borough']).reset_index(drop=True)

df1.head()
# Out[8]:
#   Postcode      Borough   Neighbourhood
# 0      M1B  Scarborough           Rouge
# 1      M1B  Scarborough         Malvern
# 2      M1C  Scarborough  Highland Creek
# 3      M1C  Scarborough      Rouge Hill
# 4      M1C  Scarborough      Port Union

# In [9]:
# Unique postcodes in order of first appearance. A new Series is built
# rather than de-duplicating a column of df1 in place.
df_postcodes = df1['Postcode'].drop_duplicates()

df1.reset_index(drop=True, inplace=True)

# One row per postcode: the borough of the last row carrying that postcode
# and all of its neighbourhoods joined with commas. A single grouped
# aggregation replaces the nested row-by-row comparison of every postcode
# against every row, and produces no leading comma to strip afterwards.
df2 = df1.groupby('Postcode', sort=False, as_index=False).agg(
    {'Borough': 'last', 'Neighbourhood': ','.join}
)

# In [ ]:
df2.shape

Name		Name	Last commit message	Last commit date
Latest commit History 2 Commits
README.md		README.md

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

About

Releases

Packages

salarshk/week3

Folders and files

Latest commit

History

Repository files navigation

About

Resources

Stars

Watchers

Forks

Releases

Packages 0

Packages