-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data Cleaning SQL.sql
171 lines (130 loc) · 6.01 KB
/
Data Cleaning SQL.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
--Cleaning Data in SQL Queres
Select *
From [portfolioproject].[dbo].[nashville_housing]
----------------------------------------------------------------------------------------------------------------------------
-- Standardise Date Format
Select SaleDate, CONVERT(Date,SaleDate)
From [portfolioproject].[dbo].[nashville_housing]
ALTER TABLE [portfolioproject].[dbo].[nashville_housing]
ADD SalesDateConverted Date;
UPDATE nashville_housing SET
SalesDateConverted = CONVERT(Date,SaleDate);
Select SaleDate,SalesDateConverted, CONVERT(Date,SaleDate)
From [portfolioproject].[dbo].[nashville_housing]
----------------------------------------------------------------------------------------------------------------------------
--Populate Property Address Data
----- Using ParcelID to check if we can get PropertyAddress in another row as seen during data checks
Select a.ParcelID,a.ParcelID,a.PropertyAddress,b.PropertyAddress,ISNULL(a.PropertyAddress, b.PropertyAddress)
From [portfolioproject].[dbo].[nashville_housing] a
JOIN [portfolioproject].[dbo].[nashville_housing] b
ON a.ParcelID = b.ParcelID
AND a.UniqueID <> b.UniqueID -- To make sure that now row links to itself
WHERE a.PropertyAddress IS NULL
--Updating NULL addresses with the found addresses.
----ISNULL function helps us replace null values in Column with values of another or a specified value.
UPDATE a
SET a.PropertyAddress = ISNULL(a.PropertyAddress, b.PropertyAddress)
From [portfolioproject].[dbo].[nashville_housing] a
JOIN [portfolioproject].[dbo].[nashville_housing] b
ON a.ParcelID = b.ParcelID
AND a.UniqueID <> b.UniqueID -- To make sure that now row links to itself
WHERE a.PropertyAddress IS NULL
----------------------------------------------------------------------------------------------------------------------------
-- Breaking Out Address Into Individual Columns (Address, City)
Select PropertyAddress
From [portfolioproject].[dbo].[nashville_housing]
-- Address is separated using a coma(,) delimiter to seperate Address, City: For example "1808 FOX CHASE DR, GOODLETTSVILLE"
---- Action: I'm going to seperate the 3 mentioned above using coma delimeter
SELECT
SUBSTRING(PropertyAddress, 1, CHARINDEX(',',PropertyAddress)-1) as Address,
SUBSTRING(PropertyAddress, CHARINDEX(',',PropertyAddress)+1, LEN(PropertyAddress)) as Address
From [portfolioproject].[dbo].[nashville_housing]
-- Creating the Address Column
ALTER TABLE [portfolioproject].[dbo].[nashville_housing]
ADD PropertySplitAddress Varchar(255);
-- Update the newly created address column with the 1st part of PropertyAddress
UPDATE nashville_housing SET
PropertySplitAddress = SUBSTRING(PropertyAddress, 1, CHARINDEX(',',PropertyAddress)-1);
-- Creating the Address Column
ALTER TABLE [portfolioproject].[dbo].[nashville_housing]
ADD PropertySplitCity Varchar(255);
-- Update the newly created address column with the 1st part of PropertyAddress
UPDATE nashville_housing SET
PropertySplitCity = SUBSTRING(PropertyAddress, CHARINDEX(',',PropertyAddress)+1, LEN(PropertyAddress));
----------------------------------------------------------------------------------------------------------------------------
-- Breaking Out Address Into Individual Columns (Address, City, State) - On OwnerAddress
Select OwnerAddress
From [portfolioproject].[dbo].[nashville_housing]
--Using the PARSENAME() Function that only see dots instead of comas hence REPLACE() to replace comas with dots
SELECT
OwnerAddress,
PARSENAME(REPLACE(OwnerAddress,',','.'), 3) as Address,
PARSENAME(REPLACE(OwnerAddress,',','.'), 2) as City,
PARSENAME(REPLACE(OwnerAddress,',','.'), 1) as State
FROM [nashville_housing]
-- Adding OwnerSplitAddress
ALTER TABLE [portfolioproject].[dbo].[nashville_housing]
ADD OwnerSplitAddress Varchar(255);
-- Update the newly created address column with the 1st part of OwnerAddress
UPDATE nashville_housing SET
OwnerSplitAddress = PARSENAME(REPLACE(OwnerAddress,',','.'), 3);
-- Adding OwnerSplitCity
ALTER TABLE [portfolioproject].[dbo].[nashville_housing]
ADD OwnerSplitCity Varchar(255);
-- Update the newly created address column with the 1st part of OwnerAddress
UPDATE nashville_housing SET
OwnerSplitCity = PARSENAME(REPLACE(OwnerAddress,',','.'), 3);
-- Adding OwnerSplitState
ALTER TABLE [portfolioproject].[dbo].[nashville_housing]
ADD OwnerSplitState Varchar(255);
-- Update the newly created address column with the 1st part of OwnerAddress
UPDATE nashville_housing SET
OwnerSplitState = PARSENAME(REPLACE(OwnerAddress,',','.'), 1);
----------------------------------------------------------------------------------------------------------------------------
-- Change Y and N to Yes No in "Sold as Vacant" field
-- Checking column cotains
SELECT DISTINCT(SoldAsVacant), Count(SoldAsVacant)
FROM nashville_housing
GROUP BY SoldAsVacant
ORDER BY 2
-- The column contains 4 different value Y,N,Yes,No
SELECT
SoldAsVacant,
CASE
WHEN SoldAsVacant = 'Y'
THEN 'Yes'
WHEN SoldAsVacant = 'N'
THEN 'No'
ELSE SoldAsVacant
END as SoldAsVacantNew
FROM nashville_housing
-- Updating SoldAsVacant with Yes insted of 'Y' and No instead of 'N'
UPDATE nashville_housing SET
SoldAsVacant = CASE
WHEN SoldAsVacant = 'Y'
THEN 'Yes'
WHEN SoldAsVacant = 'N'
THEN 'No'
ELSE SoldAsVacant
END ;
----------------------------------------------------------------------------------------------------------------------------
-- Remove Duplicates
-- Finding Duplicates
WITH RowNumCTE as (
SELECT *,
ROW_NUMBER() OVER(
PARTITION BY ParcelID,
PropertyAddress,
SalePrice,
SaleDate,
LegalReference
ORDER BY
UniqueID
) row_num
FROM nashville_housing
)
DELETE FROM RowNumCTE WHERE row_num > 1
----------------------------------------------------------------------------------------------------------------------------
-- Delete Unused Columns
ALTER TABLE nashville_housing
DROP COLUMN OwnerAddress, PropertyAddress, TaxDistrict,SaleDate