Skip to main content

Identify and delete duplicate records in SQL DB

-- Demo table for the duplicate-detection examples.
-- id is an auto-incrementing surrogate key, so two rows can carry the same
-- first_name / last_name / email / age and still be distinct rows.
CREATE TABLE DuplicateRecords (
    id         INT IDENTITY(1,1) PRIMARY KEY,
    first_name VARCHAR(30),
    last_name  VARCHAR(25),
    email      VARCHAR(210),
    age        VARCHAR(22)    -- kept as text; the sample rows insert quoted numbers
);

 

    INSERT INTO DuplicateRecords (first_name,last_name,email,age)  

    VALUES ('Kavin','Peterson','kavin.peterson@verizon.net','21'), 

           ('Nick','Jonas','nick.jonas@me.com','18'), 

           ('Peter','Heaven','peter.heaven@google.com','23'), 

           ('Michal','Jackson','michal.jackson@aol.com','22'), 

           ('Sean','Bean','sean.bean@yahoo.com','23'), 

           ('Tom ','Baker','tom.baker@aol.com','20'), 

           ('Ben','Barnes','ben.barnes@comcast.net','17'), 

           ('Mischa ','Barton','mischa.barton@att.net','18'), 

           ('Sean','Bean','sean.bean@yahoo.com','16'), 

           ('Eliza','Bennett','eliza.bennett@yahoo.com','25'), 

           ('Michal','Krane','michal.Krane@me.com','25'), 

           ('Peter','Heaven','peter.heaven@google.com','23'), 

           ('Brian','Blessed','brian.blessed@yahoo.com','20'), 

           ('Kavin','Peterson','kavin.peterson@verizon.net','21');

 

-- Show every row sorted by first name. The sort key is named explicitly
-- (instead of the ordinal "2") so the query keeps working if the column order
-- changes; id breaks ties so repeated runs list the rows in the same order.
SELECT id, first_name, last_name, email, age
FROM DuplicateRecords
ORDER BY first_name, id

 

/***************** Find Duplicate Records ******************/

-- 1st Method

-- Number the rows inside each (first_name, last_name, email, age) group and
-- report every row after the first one, i.e. the surplus copies.
;WITH numbered AS
(
    SELECT
        first_name,
        last_name,
        email,
        age,
        ROW_NUMBER() OVER (
            PARTITION BY first_name, last_name, email, age
            ORDER BY first_name, last_name, email, age
        ) AS RNumber
    FROM DuplicateRecords
)
SELECT
    first_name,
    last_name,
    email,
    age,
    RNumber
FROM numbered
WHERE RNumber > 1

GO

-- 2nd Method

-- Report each surplus copy exactly once: a row qualifies when some other row
-- with the same first_name, last_name, email and age has a smaller id.
-- A plain self-join would emit the same row once per lower-id match (twice for
-- the third copy, three times for the fourth, ...); EXISTS only asks whether
-- at least one such match exists.
-- Note: "=" never matches NULL to NULL, so rows holding NULL in any of the
-- compared columns are not reported here.
SELECT A.*
FROM DuplicateRecords AS A
WHERE EXISTS
(
    SELECT 1
    FROM DuplicateRecords AS B
    WHERE A.first_name = B.first_name
      AND A.last_name  = B.last_name
      AND A.email      = B.email
      AND A.age        = B.age
      AND A.id         > B.id
)

GO

-- 3rd Method

-- A row is a surplus copy when its id is larger than the smallest id among the
-- rows that share its first_name, last_name, email and age.
SELECT dup.*
FROM DuplicateRecords AS dup
WHERE dup.id >
(
    SELECT MIN(peer.id)
    FROM DuplicateRecords AS peer
    WHERE peer.first_name = dup.first_name
      AND peer.last_name  = dup.last_name
      AND peer.email      = dup.email
      AND peer.age        = dup.age
);

GO

-- 4th Method

-- One output row per combination that occurs more than once,
-- most frequently repeated combination first.
SELECT
    first_name,
    last_name,
    email,
    age
FROM DuplicateRecords
GROUP BY
    first_name,
    last_name,
    email,
    age
HAVING COUNT(*) > 1
ORDER BY COUNT(*) DESC

GO

-- 5th Method

-- Rank the rows of each (first_name, last_name, email, age) group from highest
-- id to lowest and report everything after the first, i.e. every copy except
-- the most recently inserted one.
-- The filter spells the alias exactly as declared ([Row]); the mismatched
-- [row] spelling is rejected on databases with a case-sensitive collation.
SELECT LU.*
FROM
(
    SELECT
        first_name,
        last_name,
        email,
        age,
        ROW_NUMBER() OVER (
            PARTITION BY first_name, last_name, email, age
            ORDER BY id DESC
        ) AS [Row]
    FROM DuplicateRecords
) AS LU
WHERE LU.[Row] > 1

GO

-- 6th Method

-- Report every surplus copy: any row whose id is not the smallest id of its
-- (first_name, last_name, email, age) group.
-- Selecting only MAX(id) per duplicated group would surface a single row per
-- group and miss the remaining copies when a combination occurs three or more
-- times. NOT IN is safe here because id is the primary key and cannot be NULL.
SELECT id, first_name, last_name, email, age
FROM DuplicateRecords
WHERE id NOT IN
(
    SELECT MIN(id)
    FROM DuplicateRecords
    GROUP BY first_name, last_name, email, age
)

 

/***************** Delete Duplicate Records ******************/

-- 1st Method

-- Delete through a CTE: number the rows of each (first_name, last_name, email,
-- age) group by ascending id and remove everything after the first.
-- Ordering the window by id (rather than by the partition columns, which are
-- all ties inside a partition) makes the survivor deterministic: the row with
-- the lowest id is always the one kept.
;WITH cteD AS
(
    SELECT
        first_name,
        last_name,
        email,
        age,
        ROW_NUMBER() OVER (
            PARTITION BY first_name, last_name, email, age
            ORDER BY id
        ) AS RNumber
    FROM DuplicateRecords
)
DELETE FROM cteD
WHERE RNumber > 1

GO

-- 2nd Method

-- Self-join delete: remove a row when another row with the same first_name,
-- last_name, email and age has a smaller id, so the lowest id of each group
-- is the one left behind.
DELETE dup
FROM DuplicateRecords AS dup
INNER JOIN DuplicateRecords AS earlier
    ON  earlier.first_name = dup.first_name
    AND earlier.last_name  = dup.last_name
    AND earlier.email      = dup.email
    AND earlier.age        = dup.age
    AND earlier.id         < dup.id

GO

-- 3rd Method

-- Remove every row whose id exceeds the smallest id found among the rows that
-- share its first_name, last_name, email and age.
DELETE dup
FROM DuplicateRecords AS dup
WHERE dup.id >
(
    SELECT MIN(peer.id)
    FROM DuplicateRecords AS peer
    WHERE peer.first_name = dup.first_name
      AND peer.last_name  = dup.last_name
      AND peer.email      = dup.email
      AND peer.age        = dup.age
);

GO

-- 4th Method

-- Delete through a derived table: rank the rows of each (first_name,
-- last_name, email, age) group from highest id to lowest and remove everything
-- after the first. This variant keeps the row with the HIGHEST id.
-- The filter spells the alias exactly as declared ([Row]); the mismatched
-- [row] spelling is rejected on databases with a case-sensitive collation.
DELETE LU
FROM
(
    SELECT
        first_name,
        last_name,
        email,
        age,
        ROW_NUMBER() OVER (
            PARTITION BY first_name, last_name, email, age
            ORDER BY id DESC
        ) AS [Row]
    FROM DuplicateRecords
) AS LU
WHERE LU.[Row] > 1

GO

-- 5th Method

-- Remove every surplus copy in one pass: keep the smallest id of each
-- (first_name, last_name, email, age) group and delete all other rows.
-- Deleting only MAX(id) per duplicated group removes a single row per group
-- per execution, so a combination that occurs three or more times would still
-- be duplicated afterwards. NOT IN is safe here because id is the primary key
-- and cannot be NULL.
DELETE FROM DuplicateRecords
WHERE id NOT IN
(
    SELECT MIN(id)
    FROM DuplicateRecords
    GROUP BY first_name, last_name, email, age
)

GO

-- 6th Method

-- Keep only the highest id of each (first_name, last_name, email, age) group
-- and delete all other rows.
-- The column is referenced as "id", matching its declaration, so the statement
-- also runs on databases with a case-sensitive collation. The former
-- "HAVING MAX(ID) IS NOT NULL" guard was dropped: id is the primary key, so
-- MAX(id) of a group can never be NULL and the guard filtered nothing.
DELETE FROM DuplicateRecords
WHERE id NOT IN
(
    SELECT MAX(id)
    FROM DuplicateRecords
    GROUP BY first_name, last_name, email, age
)

GO

Comments

  1. Really good information to show through this blog. I really appreciate you for all the valuable information that you are providing us through your blog.

    ReplyDelete

Post a Comment

Hi User,
Thanks for visiting my blog. Please provide your valuable feedback and subscribe for more updates. Please don't post any spam content or comments.
Thank You

Popular posts from this blog

SSRS INTERVIEW QUESTIONS

Q: What is SSRS? • SSRS, or SQL Server Reporting Services, is a server-based report generation software system from Microsoft and is part of Microsoft BI. • It is used for preparing and delivering a variety of interactive reports. • It is administered through a web-based interface. • Reporting Services uses a web service interface to support the development of customized reporting applications. • SSRS lets you create very rich reports (tabular/graphical/interactive) from various data sources with rich data visualization (charts, maps, sparklines). • SSRS allows reports to be exported in various formats (Excel, PDF, Word, etc.). Q: Explain the SSRS architecture. The Reporting Services architecture comprises integrated components. It is multi-tiered, with application, server, and data layers. This architecture is scalable and modular. A single installation can be used across multiple computers. It includes the fo...

Exception deserializing the package "The process cannot access the file because it is being used by another process."

TITLE: Microsoft Visual Studio ------------------------------ Failed to start project ------------------------------ ADDITIONAL INFORMATION: Exception deserializing the package "The process cannot access the file 'E:\SSASCube\HistoricalDataLoad\HistoricalDataLoad\bin\Development\HistoricalDataLoad.ispac' because it is being used by another process.". (Microsoft.DataTransformationServices.VsIntegration) ------------------------------ The process cannot access the file 'E:\SSASCube\HistoricalDataLoad\HistoricalDataLoad\bin\Development\HistoricalDataLoad.ispac' because it is being used by another process. (mscorlib) ------------------------------ BUTTONS: OK ------------------------------ While running an SSIS package, I got the error “The process cannot access the file ‘*.ispac’ because it is being used by another process”. I tried closing SSDT and running it again, but I still got the same error while compiling. Then, after searching the internet, I got...

Failed to execute the package or element. Build errors were encountered

Error: TITLE: Microsoft Visual Studio ------------------------------ Failed to execute the package or element. Build errors were encountered. For more information, see the Output window. ------------------------------ BUTTONS: OK ------------------------------ Solution: We tried closing SSDT and running it again, but we still got the same error while running the SSIS package. The following steps resolved it: Step 1: Go to Task Manager –> Details tab. Step 2: Locate the process “DtsDebugHost.exe”. Kill this process. There might be multiple instances of this process; kill all of them. Step 3: Rerun the SSIS package.