COM*******************************************************************************
COM This is an interactive script, so the first thing it does is uses dialog boxes to identify the tables and fields needed in the
COM script.
COM*******************************************************************************
set safety off
DIALOG (DIALOG TITLE "User Dialog" WIDTH 472 HEIGHT 277 ) (BUTTONSET TITLE "&OK;&Cancel" AT 370 12 DEFAULT 1 ) (TEXT TITLE "Q-Gram test for duplicates" AT 48 40 WIDTH 164 HEIGHT 32 ) (TEXT TITLE "Opens file, Removes special character for key field, Sorts file, Performs segment by segment comparision of consecutive records." AT 84 88 WIDTH 312 HEIGHT 46 ) (TEXT TITLE "Identifies records if the number of matching different segments is >= threshold value." AT 84 160 WIDTH 312 HEIGHT 29 ) (TEXT TITLE "cc Porter Broyles shared under the GFDL" AT 276 220 WIDTH 144 HEIGHT 29 )
DIALOG (DIALOG TITLE "User Dialog" WIDTH 484 HEIGHT 347 ) (BUTTONSET TITLE "&OK;&Cancel" AT 360 24 DEFAULT 1 ) (TEXT TITLE "Fuzy logic test for duplicates" AT 24 40 WIDTH 147 HEIGHT 24 ) (TEXT TITLE "Select table to open" AT 24 76 ) (ITEM TITLE "f" TO "v_infile" AT 120 120 WIDTH 205 HEIGHT 130 )
COM*******************************************************************************
COM Deleting the user defined fields, to ensure a clean slate.
COM*******************************************************************************
open %v_infile%
DELETE Dup_test1 OK
DELETE Dup_test2 OK
DELETE Fuzzy_Dup.fil OK
DELETE format Fuzzy_Dup OK
DELETE Fuzzy_TEMP.fil OK
DELETE format Fuzzy_TEMP OK
DELETE Fuzzy_TEMP2.fil OK
DELETE format Fuzzy_TEMP2 OK
DELETE TEMP.fil OK
DELETE format TEMP OK
DELETE Temp1 OK
DELETE List OK
DIALOG (DIALOG TITLE "User Dialog" WIDTH 501 HEIGHT 495 ) (BUTTONSET TITLE "&OK;&Cancel" AT 370 12 DEFAULT 1 ) (TEXT TITLE "Select field to test for duplicates" AT 24 184 WIDTH 163 ) (ITEM TITLE "CND" TO "v_dup_field" AT 24 228 WIDTH 202 HEIGHT 130 ) (TEXT TITLE "Set threshold percent" AT 24 52 WIDTH 184 HEIGHT 51 ) (EDIT TO "v_threshold" AT 240 72 WIDTH 39 HEIGHT 24 DEFAULT "85" ) (TEXT TITLE "Select field - to list if duplicate record" AT 252 184 WIDTH 180 HEIGHT 31 ) (ITEM TITLE "CND" TO "v_list" AT 240 228 WIDTH 223 HEIGHT 133 ) (TEXT TITLE "Select Length of Characters to in Each Segment" AT 24 124 ) (EDIT TO "V_string" AT 348 132 WIDTH 23 DEFAULT "2" ) (TEXT TITLE "Select how close the segments must occur to one another:" AT 12 292 ) (EDIT TO "V_Distance" AT 420 288 WIDTH 39 HEIGHT 21 DEFAULT "3" ) (TEXT TITLE "Note: Smaller numbers will capture a tighter universe, but larger numbers will capture more potentials (and false positives.)" AT 12 328 WIDTH 453 HEIGHT 42 ) (TEXT TITLE "For example, a small value will not match up the 'CO' in Colorado with the 'CO' in The Microsoft Corporation." AT 12 376 WIDTH 454 HEIGHT 38 ) (TEXT TITLE "A small value may not capture 'The University of South Carolina Department of Psychology' as a duplicate to 'The University of South Carolina Psychology Department.'" AT 12 424 WIDTH 460 HEIGHT 52 )
COM*******************************************************************************
COM The dialog boxes allow users to select numeric or date fields, but the code is written for ASCII fields.
COM By using the FTYPE command we ensure that the field type is ASCII to ensure the script works.
COM the v_string,v_threshold, and v_distance variables are used as part of the q-gram process.
COM*******************************************************************************
v_list_len = Length(v_list)
v_dup_len = Length(v_dup_field)
V_string = value(V_string,0)
v_threshold = value(v_threshold,2)/100
V_distance= value(V_distance,0)
IF FTYPE("%v_dup_field%")="N" DEFINE FIELD Temp1 COMPUTED STRING(%v_dup_field%,v_dup_len)
IF FTYPE("%v_list%")="N" DEFINE FIELD List COMPUTED STRING(%v_list%,v_list_len)
IF FTYPE("%v_dup_field%")="C" DEFINE FIELD Temp1 COMPUTED (%v_dup_field%)
IF FTYPE("%v_list%")="C" DEFINE FIELD List COMPUTED (%v_list%)
IF FTYPE("%v_dup_field%")="D" DEFINE FIELD Temp1 COMPUTED DATE(%v_dup_field%)
IF FTYPE("%v_list%")="D" DEFINE FIELD List COMPUTED DATE(%v_list%)
DEFINE field Dup_test1 Computed INCLUDE(UPPER(Temp1),'QWERTYUIOPLKJHGFDSAZXCVBNM1234567890')
COM*******************************************************************************
COM sorting on the dup_test1 field to get similar items close together and extracting values to TEMP_1
COM this creates a new table to run the analysis on that contains the data from the current and subsequent lines.
COM*******************************************************************************
SORT on Dup_test1 to temp OPEN
Dup_test2 = blank(50)
List2 = blank(50)
%v_dup_field%v2 = blank(50)
GROUP
Extract %v_dup_field% %v_dup_field%v2 Dup_test1 list Dup_test2 list2 to Temp_1
Dup_test2 = Dup_test1
list2 = list
%v_dup_field%v2 = %v_dup_field%
END
COM*******************************************************************************
COM The Summary minimizes the number of duplicates thus improving efficeincy of the GROUP-LOOP statement
COM*******************************************************************************
OPEN Temp_1
Summarize ON Dup_test1 Dup_test2 OTHER List List2 %v_dup_field% %v_dup_field%v2 to Temp_2
COM*******************************************************************************
COM The group Loop statement will compare the Dup_test1 and Dup_test2 fields.
COM This script utilizes imbedded loops (EG Loops within Loops) to compare fields looking for
COM strings that are v_string in length.
COM*******************************************************************************
OPEN Temp_2
loop_max=MAX(LEN(alltrim(Dup_test1)),LEN(alltrim(Dup_test2))) - v_string + 1
loop_ctr=1
Loop_ctr2=1
Ldup_key=0
V_Counter=0
Group
LOOP while loop_ctr<=loop_max
Compare_1 = substr(dup_test1,loop_ctr,V_string)
Loop while loop_ctr2<=loop_max AND V_Counter = 0
V_Counter=0
V_Counter= 1 if compare_1=substr(dup_test2,loop_ctr2,v_string)AND COMPARE_1<>" " AND BETWEEN(loop_ctr,Loop_ctr2-v_distance,Loop_ctr2+v_distance)
Extract %v_dup_field% %v_dup_field%v2 Dup_test1 Dup_test2 list list2 LDup_key V_Counter Loop_ctr loop_ctr2 IF V_Counter = 1 to "Fuzzy Temp"
Loop_ctr2=loop_ctr2+1
end
loop_ctr =loop_ctr+1
LDup_key=Ldup_key+V_Counter
V_Counter=0
Loop_ctr2=1
end
Ldup_key=0
loop_ctr=1
loop_ctr2=1
loop_max=MAX(LEN(alltrim(Dup_test1)),LEN(alltrim(Dup_test2))) - v_string + 1
end
OPEN Fuzzy_temp
DELETE THRESHOLD OK
DEFINE FIELD THRESHOLD COMPUTED v_threshold*(MAX(LEN(alltrim(Dup_test1)),LEN(alltrim(Dup_test2)))-v_string+1)
Summarize on Dup_test1 Dup_test2 OTHER %v_dup_field% %v_dup_field%v2 list list2 threshold Acc V_Counter to Fuzzy_temp2 PRESORT OPEN
EXTRACT %v_dup_field% %v_dup_field%v2 list as "%v_list%" list2 as "%v_list%2" V_Counter threshold if V_Counter>=threshold AND %v_dup_field% <> %v_dup_field%v2 to Fuzzy_Q_Dup_temp OPEN
open %v_infile%
DELETE Dup_test1 OK
DELETE Temp_1.fil OK
DELETE FORMAT Temp_1 OK
DELETE Temp_2.fil OK
DELETE FORMAT Temp_2 OK
DELETE Fuzzy_Temp.fil OK
DELETE FORMAT Fuzzy_temp OK
DELETE Fuzzy_Temp2.fil OK
DELETE FORMAT Fuzzy_temp2 OK
DEFINE field Dup_test1 Computed ALLTRIM(REVERSE(INCLUDE(UPPER(Temp1),'QWERTYUIOPLKJHGFDSAZXCVBNM1234567890')))
COM*******************************************************************************
COM The Dup_test1 field is reversed and the script is run again, this helps to ensure that
COM values not captured before are identified.
COM*******************************************************************************
SORT on Dup_test1 to temp OPEN
Dup_test2 = blank(50)
List2 = blank(50)
%v_dup_field%v2 = blank(50)
GROUP
Extract %v_dup_field% %v_dup_field%v2 Dup_test1 list Dup_test2 list2 to Temp_1
Dup_test2 = Dup_test1
list2 = list
%v_dup_field%v2 = %v_dup_field%
END
OPEN Temp_1
Summarize ON Dup_test1 Dup_test2 OTHER List List2 %v_dup_field% %v_dup_field%v2 to Temp_2
OPEN Temp_2
loop_max=MAX(LEN(alltrim(Dup_test1)),LEN(alltrim(Dup_test2))) - v_string + 1
loop_ctr=1
Loop_ctr2=1
Ldup_key=0
V_Counter=0
Group
LOOP while loop_ctr<=loop_max
Compare_1 = substr(dup_test1,loop_ctr,V_string)
Loop while loop_ctr2<=loop_max AND V_Counter = 0
V_Counter=0
V_Counter= 1 if compare_1=substr(dup_test2,loop_ctr2,v_string)AND COMPARE_1<>" " AND BETWEEN(loop_ctr,Loop_ctr2-v_distance,Loop_ctr2+v_distance)
Extract %v_dup_field% %v_dup_field%v2 Dup_test1 Dup_test2 list list2 LDup_key V_Counter Loop_ctr loop_ctr2 IF V_Counter = 1 to "Fuzzy Temp"
Loop_ctr2=loop_ctr2+1
end
loop_ctr =loop_ctr+1
LDup_key=Ldup_key+V_Counter
V_Counter=0
Loop_ctr2=1
end
Ldup_key=0
loop_ctr=1
loop_ctr2=1
loop_max=MAX(LEN(alltrim(Dup_test1)),LEN(alltrim(Dup_test2))) - v_string + 1
end
COM*******************************************************************************
COM identifying records to be preserved in final report.
COM*******************************************************************************
OPEN Fuzzy_temp
DELETE THRESHOLD OK
DEFINE FIELD THRESHOLD COMPUTED v_threshold*(MAX(LEN(alltrim(Dup_test1)),LEN(alltrim(Dup_test2)))-v_string+1)
Summarize on Dup_test1 Dup_test2 OTHER %v_dup_field% %v_dup_field%v2 list list2 threshold Acc V_Counter to Fuzzy_temp2 PRESORT OPEN
EXTRACT %v_dup_field% %v_dup_field%v2 list as "%v_list%" list2 as "%v_list%2" V_Counter threshold if V_Counter>=threshold AND %v_dup_field% <> %v_dup_field%v2 to Fuzzy_Q_Dup_temp OPEN APPEND
Summarize on %v_dup_field% %v_dup_field%v2 OTHER %v_list% %v_list%2 V_Counter threshold TO Fuzzy_Q_Dupe OPEN PRESORT
COM*******************************************************************************
COM cleaning up unnecessary files and fields.
COM*******************************************************************************
open %v_infile%
DELETE Fuzzy_Q_Dup_temp.fil OK
DELETE FORMAT Fuzzy_Q_Dup_temp OK
DELETE Dup_test1 OK
DELETE Dup_test2 OK
DELETE v_list OK
DELETE v_threshold OK
DELETE v_dup_field OK
DELETE v_infile OK
DELETE loop_Ctr OK
DELETE loop_max OK
DELETE Fuzzy_TEMP.fil OK
DELETE format Fuzzy_TEMP OK
DELETE Fuzzy_TEMP2.fil OK
DELETE format Fuzzy_TEMP2 OK
DELETE TEMP.fil OK
DELETE format TEMP OK
DELETE Temp_1.fil OK
DELETE format Temp_1 OK
DELETE Temp_2.fil OK
DELETE format Temp_2 OK
OPEN Fuzzy_Q_Dupe
set safety on