Limpieza de datos usando SQL.
Realizar un proceso de limpieza a un set de datos utilizando la herramienta SQL Server. Efectuando distintas tareas como visualizaciones de datos, correcciones de valores, actualizaciones de tablas, eliminación de espacios innecesarios, etc.
Este es un proyecto que hace parte de la lista de reproducción de proyectos para el desarrollo de un portafolio de Alex The Analyst.
En los archivos podrán encontrar:
- Nashville Housing Data for Data Cleaning.xlsx--> Archivo de excel con los datos a limpiar.
- Limpieza de datos en SQL.pptx--> Presentación en la que de manera resumida se muestran los pasos ejecutados para la limpieza en SQL.
- LimpiezaDeDatos.sql--> Archivo en el que se encuentran las consultas ejecutadas.
- Cheat Sheet SQL Data Preparation.jpg--> Imagen que contiene algunas de las consultas más utilizadas para la preparación de datos en SQL.
Link a la lista de reproducción de Alex. https://www.youtube.com/watch?v=8rO7ztF4NtU&list=PLUaB-1hjhk8H48Pj32z4GZgGWyylqv85f&index=3&ab_channel=AlexTheAnalyst
SELECT TOP 1000 *
FROM Nashville..Houses;
Eliminación de los valores de horas, ya que solo contiene información de fecha
SELECT SaleDate
FROM Nashville..Houses;
SELECT SaleDate , CONVERT(date, SaleDate)
FROM Nashville..Houses;
UPDATE Nashville..Houses
SET SaleDate = CONVERT(date, SaleDate);
ALTER TABLE Nashville..Houses
ADD SaleDateConverted Date;
UPDATE Nashville..Houses
SET SaleDateConverted = CONVERT(date, SaleDate);
ALTER TABLE Nashville..Houses
DROP COLUMN SaleDate;
SELECT SaleDateConverted
FROM Nashville..Houses
WHERE SaleDateConverted IS NULL;
SELECT PropertyAddress
FROM Nashville..Houses;
SELECT PropertyAddress
FROM Nashville..Houses
WHERE PropertyAddress IS NULL;
SELECT COUNT(*)
FROM Nashville..Houses
WHERE PropertyAddress IS NULL;
SELECT *
FROM Nashville..Houses
WHERE PropertyAddress IS NULL;
SELECT COUNT(PropertyAddress), ParcelID, PropertyAddress
FROM Nashville..Houses
GROUP BY ParcelID, PropertyAddress
HAVING COUNT (PropertyAddress) > 1
ORDER BY ParcelID
SELECT ParcelID, PropertyAddress
FROM Nashville..Houses
WHERE ParcelID IN
(SELECT ParcelID
FROM Nashville..Houses
GROUP BY ParcelID, PropertyAddress
HAVING COUNT (PropertyAddress) > 1)
ORDER BY ParcelID
SELECT a.ParcelID, a.PropertyAddress, b.ParcelID, b.PropertyAddress, ISNULL(a.PropertyAddress, b.PropertyAddress)
FROM Nashville..Houses AS a
JOIN Nashville..Houses AS b
ON a.ParcelID = b.ParcelID
AND a.[UniqueID ]<> b.[UniqueID ]
WHERE a.PropertyAddress IS NULL;
UPDATE a
SET PropertyAddress = ISNULL(a.PropertyAddress, b.PropertyAddress)
FROM Nashville..Houses AS a
JOIN Nashville..Houses AS b
ON a.ParcelID = b.ParcelID
AND a.[UniqueID ]<> b.[UniqueID ]
WHERE a.PropertyAddress IS NULL;
SELECT DISTINCT(PropertyAddress)
FROM Nashville..Houses;
SELECT
SUBSTRING (PropertyAddress, 1, CHARINDEX(',', PropertyAddress) - 1) as Direccion
FROM Nashville..Houses;
SELECT
SUBSTRING (PropertyAddress, CHARINDEX(',', PropertyAddress) + 2, LEN(PropertyAddress)) as Ciudad
FROM Nashville..Houses;
ALTER TABLE Nashville..Houses
ADD PropertyDireccion NVARCHAR(255)
UPDATE Nashville..Houses
SET PropertyDireccion = (SUBSTRING (PropertyAddress, 1, CHARINDEX(',', PropertyAddress) - 1));
ALTER TABLE Nashville..Houses
ADD PropertyCiudad NVARCHAR(255)
UPDATE Nashville..Houses
SET PropertyCiudad = SUBSTRING (PropertyAddress, CHARINDEX(',', PropertyAddress) + 2, LEN(PropertyAddress));
SELECT TOP 100 *
FROM Nashville..Houses;
SELECT DISTINCT(OwnerAddress)
FROM Nashville..Houses;
SELECT PARSENAME(REPLACE(OwnerAddress, ',', '.'),3),
PARSENAME(REPLACE(OwnerAddress, ',', '.'),2),
PARSENAME(REPLACE(OwnerAddress, ',', '.'),1)
FROM Nashville..Houses;
ALTER TABLE Nashville..Houses
ADD OwnerDireccion NVARCHAR(255)
UPDATE Nashville..Houses
SET OwnerDireccion = PARSENAME(REPLACE(OwnerAddress, ',', '.'),3);
ALTER TABLE Nashville..Houses
ADD OwnerCiudad NVARCHAR(255)
UPDATE Nashville..Houses
SET OwnerCiudad = PARSENAME(REPLACE(OwnerAddress, ',', '.'),2);
ALTER TABLE Nashville..Houses
ADD OwnerEstado NVARCHAR(255)
UPDATE Nashville..Houses
SET OwnerEstado = PARSENAME(REPLACE(OwnerAddress, ',', '.'),1);
##- Revisión de valores en SoldAsVacant
SELECT DISTINCT (SoldAsVacant), COUNT (SoldAsVacant) AS Frecuencia
FROM Nashville..Houses
GROUP BY SoldAsVacant
ORDER BY Frecuencia;
SELECT SoldAsVacant,
CASE WHEN SoldAsVacant = 'Y' THEN 'Yes'
WHEN SoldAsVacant = 'N' THEN 'No'
ELSE SoldAsVacant
END
FROM Nashville..Houses
UPDATE Nashville..Houses
SET SoldAsVacant = (CASE WHEN SoldAsVacant = 'Y' THEN 'Yes'
WHEN SoldAsVacant = 'N' THEN 'No'
ELSE SoldAsVacant
END)
Consulta de registros duplicados según las columnas: ParcelId, PropertyAddres, SalePrice, SaleDate, LegalReference
WITH TablaTem AS(
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY ParcelId,
PropertyAddress,
SalePrice,
SaleDateConverted,
LegalReference
ORDER BY
UniqueID
) AS DUPLICADO
FROM Nashville..Houses
)
SELECT *
FROM TablaTem
WHERE DUPLICADO >1
WITH TablaTem AS(
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY ParcelId,
PropertyAddress,
SalePrice,
SaleDateConverted,
LegalReference
ORDER BY
UniqueID
) AS DUPLICADO
FROM Nashville..Houses
)
DELETE
FROM TablaTem
WHERE DUPLICADO >1
ALTER TABLE Nashville..Houses
DROP COLUMN OwnerAddress, TaxDistrict, PropertyAddress
SELECT LandUse, TRIM(REPLACE(LandUse,' ','')), LEN(LandUse), LEN(TRIM(REPLACE(LandUse,' ','')))
FROM Nashville..Houses
WHERE LEN(LandUse) <> LEN(TRIM(REPLACE(LandUse,' ','')))
UPDATE Nashville..Houses
SET LandUse = (CASE WHEN LandUse = 'CONDOMINIUM OFC OR OTHER COM CONDO' THEN 'CONDOMINIUM OFC OR OTHER COM CONDO'
ELSE LandUse
END)
SELECT LandUse, COUNT(LandUse) AS Frecuencia
FROM Nashville..Houses
GROUP BY LandUse
ORDER BY Frecuencia
SELECT [UniqueID], TRIM(REPLACE([UniqueID],' ','')), LEN([UniqueID]), LEN(TRIM(REPLACE([UniqueID],' ','')))
FROM Nashville..Houses
WHERE LEN([UniqueID]) <> LEN(TRIM(REPLACE([UniqueID],' ','')))
SELECT ParcelId, TRIM(REPLACE(ParcelId,' ','')), LEN(ParcelId), LEN(TRIM(REPLACE(ParcelId,' ','')))
FROM Nashville..Houses
WHERE LEN(ParcelId) <> LEN(TRIM(REPLACE(ParcelId,' ','')))
SELECT LegalReference, TRIM(REPLACE(LegalReference,' ','')), LEN(LegalReference), LEN(TRIM(REPLACE(LegalReference,' ','')))
FROM Nashville..Houses
WHERE LEN(LegalReference) <> LEN(TRIM(REPLACE(LegalReference,' ','')))
Validación de atributo OwnerName con valores con espacios adicionales, espacios dobles al interior de la cadena.
SELECT OwnerName, TRIM(REPLACE(OwnerName,' ','')), LEN(OwnerName), LEN(TRIM(REPLACE(OwnerName,' ','')))
FROM Nashville..Houses
WHERE LEN(OwnerName) <> LEN(TRIM(REPLACE(OwnerName,' ','')));
UPDATE Nashville..Houses
SET OwnerName = TRIM(REPLACE(OwnerName,' ',' '));
SELECT OwnerName, TRIM(REPLACE(OwnerName,' ','')), LEN(OwnerName), LEN(TRIM(REPLACE(OwnerName,' ','')))
FROM Nashville..Houses
WHERE LEN(OwnerName) <> LEN(TRIM(REPLACE(OwnerName,' ','')));
Validación de atributo PropertyDireccion con valores con espacios adicionales, 55788 valores líneas con espacios dobles, triples, al inicio o final.
SELECT PropertyDireccion, TRIM(REPLACE(PropertyDireccion,' ','')), LEN(PropertyDireccion), LEN(TRIM(REPLACE(PropertyDireccion,' ','')))
FROM Nashville..Houses
WHERE LEN(PropertyDireccion) <> LEN(TRIM(REPLACE(PropertyDireccion,' ','')));
UPDATE Nashville..Houses
SET PropertyDireccion = TRIM(REPLACE(PropertyDireccion,' ',' '));
UPDATE Nashville..Houses
SET PropertyDireccion = TRIM(REPLACE(PropertyDireccion,' ',' '));
SELECT PropertyCiudad, TRIM(REPLACE(PropertyCiudad,' ','')), LEN(PropertyCiudad), LEN(TRIM(REPLACE(PropertyCiudad,' ','')))
FROM Nashville..Houses
WHERE LEN(PropertyCiudad) <> LEN(TRIM(REPLACE(PropertyCiudad,' ','')));
Validación de atributo OwnerDireccion con valores con espacios adicionales, 25273 valores líneas con espacios dobles, triples, al inicio o final.
SELECT OwnerDireccion, TRIM(REPLACE(OwnerDireccion,' ','')), LEN(OwnerDireccion), LEN(TRIM(REPLACE(OwnerDireccion,' ','')))
FROM Nashville..Houses
WHERE LEN(OwnerDireccion) <> LEN(TRIM(REPLACE(OwnerDireccion,' ','')));
UPDATE Nashville..Houses
SET OwnerDireccion = TRIM(REPLACE(OwnerDireccion,' ',' '));
UPDATE Nashville..Houses
SET OwnerDireccion = TRIM(REPLACE(OwnerDireccion,' ',' '));
Validación de atributo OwnerCiudad con valores con espacios adicionales, 25969 valores líneas con espacios dobles, triples, al inicio o final.
SELECT OwnerCiudad, TRIM(REPLACE(OwnerCiudad,' ','')), LEN(OwnerCiudad), LEN(TRIM(REPLACE(OwnerCiudad,' ','')))
FROM Nashville..Houses
WHERE LEN(OwnerCiudad) <> LEN(TRIM(REPLACE(OwnerCiudad,' ','')));
SELECT DISTINCT(OwnerCiudad)
FROM Nashville..Houses
WHERE LEN(OwnerCiudad) <> LEN(TRIM(REPLACE(OwnerCiudad,' ','')));
/* Ciudades con error que deberían corregirse en la base de datos OLD HICKORY WHITES CREEK MOUNT JULIET JOELTON GOODLETTSVILLE ANTIOCH BELLEVUE MADISON NASHVILLE NOLENSVILLE HERMITAGE BRENTWOOD */
UPDATE Nashville..Houses
SET OwnerCiudad = TRIM(REPLACE(OwnerCiudad,' ',''));
ALTER TABLE Nashville.dbo.Houses ALTER COLUMN YearBuilt int;
GO