Streamlining Data Analysis: Cleaning a Large Laptop Dataset with SQL
When it comes to data analysis, one of the most important steps is cleaning the dataset. This is because datasets are often messy and incomplete, with missing values, duplicates, and inconsistencies that can make it difficult to extract meaningful insights from the data. In this blog post, we will walk you through the process of cleaning a dataset of approximately one thousand laptops using SQL.
-- So congratulation, now we have cleaned data and ready for performing EDA. In this way we also saved lots of memory and reduce the cost of
-- So, let's start with cleaning the dataset but first create a separate database for it.
drop database laptopdb;
create database blogs;
use blogs;
select * from laptops;
desc laptops;
-- to prevent our original data we will create backup for it.
create table backup_laptop like laptops;
insert into backup_laptop
select * from laptops;
select * from backup_laptop;
-- here we will cleaned data by different steps
-- 1. single column based
-- 2. multiple column based
-- 1. dropping column
alter table laptops
drop column `unnamed: 0`;
-- 2. adding auto_increment column at first
ALTER TABLE laptops
ADD COLUMN id INT AUTO_INCREMENT FIRST,
ADD PRIMARY KEY (id);
-- 3. check table description
desc laptops;
-- 4. renaming RAM to RAM(GB) and removing GB from the values
alter table laptops
rename column `Ram` to `RAM(GB)`;
update laptops
set `RAM(GB)` = replace(`RAM(GB)`,'GB','');
select * from laptops;
-- 5. rounding off price column and making data type int.
update laptops
set price = round(price);
alter table laptops
modify price int;
-- now we will work on screen resolution
alter table laptops
modify column Resolution varchar(255) after screenResolution;
update laptops
set Resolution = substring_index(screenResolution,' ',-1);
select * from laptops;
-- now we will create a new column as CPU_speed(GHz)
alter table laptops
add column `Speed(GHz)` decimal(3,2) after `cpu`;
update laptops
set `Speed(GHz)` = replace(substring_index(cpu,' ',-1) ,'GHz','');
select * from laptops;
-- adding one more column as CPU_brand
alter table laptops
add column CPU_brand varchar(255) after cpu;
update laptops
set CPU_brand = substring_index(cpu,' ',1);
-- now we can easily extract processor info and make a column.
alter table laptops
add column Processor varchar(255) after cpu;
update laptops
set processor = replace(replace(cpu,cpu_brand,''),substring_index(replace(cpu,cpu_brand,''),' ',-1),'');
-- now we will drop unused column
alter table laptops
drop column cpu;
select * from laptops;
-- now we will try to extract info related to screen type from ScreenResolution column.
alter table laptops
add column ScreenType varchar(255) after screenResolution;
update laptops
set ScreenType = trim(replace(screenResolution,resolution,''));
select * from laptops;
-- now again we don't need ScrenResolution column
alter table laptops
drop column ScreenResolution;
-- inserting N/A where the value is missing.
update laptops
set ScreenType = 'N/A'
where ScreenType = '';
select * from laptops;
-- My main aim to clean the data but we will do it level wise so first we will clean simple data. Now I will remove 'kg'
-- from weight column cause it is not needed. Lets rename the column first.
update laptops
set weight = replace(weight,'kg','');
alter table laptops
change column `weight` `Weight(KG)` float(5,2);
-- now we will work on OpSys
select opsys,count(*) from laptops
group by opsys;
update laptops
set opsys = case
when opsys like 'windows%' then 'Windows'
when opsys like 'linux' then 'Linux'
when opsys like 'no os' then 'N/A'
when opsys like '%mac%' then 'MacOS'
else 'Other'
end;
select * from laptops;
-- no we will see that we got success to reduce memory or not
SELECT DATA_LENGTH/1024 FROM information_schema.TABLES
WHERE TABLE_SCHEMA = "blogs"
AND TABLE_NAME= 'laptops';
-- now we will drop non required column.
select * from laptops;
alter table laptops
drop column gpu;
-- now we will work on memory section
select memory, count(*) from laptops
group by memory;
alter table laptops
add column storage_type varchar(255) after memory,
add column `primary_storage(GB)` varchar(255) after storage_type,
add column `secondry_storage(GB)` varchar(255) after `primary_storage(GB)`;
select * from laptops;
-- now we will seperate storage type
update laptops
set storage_type =
case
when memory like '%SSD%' and memory like '%HDD%' then 'Hybrid'
when memory like '%Flash%' and memory like '%HDD%' then 'Hybrid'
when memory like '%ssd%' and memory like '%Hybrid%' then 'Hybrid'
when memory like '%ssd%' then 'SSD'
when memory like '%HDD%' then 'HDD'
when memory like '%Hybrid%' then 'Hybrid'
when memory like '%Flash Storage%' then 'Flash Storage'
end;
-- now we will divide memory into primary and secondry storage
update laptops
set `primary_storage(GB)` = regexp_substr(substring_index(memory,'+',1),'[0-9]+'),
`secondry_storage(GB)` =
case
when
memory like '%+%' then regexp_substr(substring_index(memory,'+',-1),'[0-9]+')
else 0
end;
-- now we will change datatype of some column to reduce memory uses
alter table laptops
modify column `primary_storage(GB)` smallint unsigned,
modify column `secondry_storage(GB)` smallint unsigned,
modify column `RAM(GB)` tinyint unsigned,
modify column `company` varchar(255),
modify column `TypeName` varchar(255),
modify column `opsys` varchar(255);
desc laptops;
-- now we will seperate resolution width and height from resolution table so that in future we can create PPI as well.
select * from laptops;
alter table laptops
add column resolution_width smallint unsigned after resolution ,
add column resolution_height smallint unsigned after resolution_width,
add column ppi decimal(5,2) after resolution_height;
update laptops
set resolution_width = substring_index(resolution,'x',1),
resolution_height = substring_index(resolution,'x',-1),
ppi = (resolution_width + resolution_height)/Inches;
-- now we additionally adding one more column touchscreen
alter table laptops
add column touchscreen boolean after screentype;
update laptops
set touchscreen =
case
when screentype like '%touchscreen%' then 1 else 0
end;
select * from laptops;
-- now we don't need resolution column so we gonna drop it.
alter table laptops
drop resolution;
-- we also don't need memory column so lets delete it.
alter table laptops
drop column memory;
-- it doesn't sound good screentype has written touchscreen because we already have seperate column for it.
update laptops
set screentype = case when screentype like 'N/A' then 'N/A' else substring_index(screentype,'/','1') end;
Comments
Post a Comment