-
Notifications
You must be signed in to change notification settings - Fork 0
/
killdupes.pl
152 lines (125 loc) · 4.87 KB
/
killdupes.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/perl
# little program to kill all the duplicates in a file and echo the result to standard output
use strict;
# NOTE: "use warnings" is deliberately left off. The option variables below stay
# undef when a switch is absent and are compared with "eq" elsewhere in the
# script, which would emit an "uninitialized value" warning on every comparison.

# --- Command line ------------------------------------------------------------
my $filename = $ARGV[0];    # first argument (scalar element, not the slice @ARGV[0])
my @input    = @ARGV;       # copy of the complete argument list
#my $option = @ARGV[1];
my @input_matches;          # scratch list for the option scan (declared for strict)

# --- Working storage ---------------------------------------------------------
my @file_array;             # lines of the input file
my $line_no = 0;            # line counter
my $curr_line;              # line currently being handled
my $last_array_entry;       # line count recorded after reading (declared for strict)
my @unique_array;           # distinct lines that will be printed
my %test_hash;              # lines already seen

# --- Option switches (undef until the option scan sets them) -----------------
my $debug_option;
my $tws_option;
my $dupefile_option;
my $loud_option;
# Banner. Everything informational goes to STDERR so that STDOUT carries only
# the de-duplicated text.
print STDERR
    "\n\n \t KILLDUPES by Batch McNulty (With thanks to Gabor Szabo) \n",
    "\n Finally you can properly kill duplicate lines in a text file",
    " without \n any nonsense about trailing whitespace or the wrong type of CR / LF.",
    "\n";

# No filename given: show the usage text and stop.
# The check is defined/non-empty rather than a plain truth test, so that a file
# literally named "0" is accepted instead of being treated as "missing".
if (!defined $ARGV[0] || $ARGV[0] eq "") {
    print STDERR
        "\n USAGE: killdupes filename.ext",
        "\n\tkilldupes filename.ext > output.txt",
        "\n",
        "\n Eliminates all duplicate lines in filename.ext and sends the results to ",
        "\n standard output, where you can redirect them to a file or do whatever you ",
        "\n like. It is more aggressive than sort -u or uniq because trailing whitespace ",
        "\n and mixed Windows/Linux style CRLFs are ignored.",
        "\n";

    # These options are still in the program, but I didn't think they'd be any use to you.
    # Feel free to uncomment 'em though.
    # print STDERR "\n killdupes filename.ext -loud ";
    # print STDERR "\n Also prints found duplicates to standard error (usually the screen).";
    # print STDERR "\n";
    # print STDERR "\n killdupes filename.txt -dupefile ";
    # print STDERR "\n Also prints found duplicates to dupefile.txt.";
    # print STDERR "\n";
    # print STDERR "\n killdupes filename.txt -debug";
    # print STDERR "\n Also prints debugging information to standard error (Implies -loud).";
    # print STDERR "\n";
    # print STDERR "\n killdupes filename.txt -ignoretws";
    # print STDERR "\n Ignores trailing whitespace - like sort -u.";

    print STDERR
        "\n This program is free, but if you want to give me money, my Bitcoin address is: ",
        "\n 1NYnGXRS4ZzNzmHu5Hsrqx169D7k7qBcYy ";

    # die (rather than exit) so the script ends with a non-zero status.
    die "\n\nThis program requires you to enter a filename as a rider\n\n";
}
print STDERR "\n Opening $filename for killdupe... \n";

# Scan the argument list for the recognised switches.
# Each switch is matched EXACTLY. A substring match would let an unrelated
# argument that merely contains the text (for example a file called
# "notes-debug.txt") be picked up first and hide the real switch.
# A variable ends up holding its switch text when the switch was given,
# and stays undef otherwise.
my %slot_for = (
    "-ignoretws" => \$tws_option,
    "-debug"     => \$debug_option,
    "-dupefile"  => \$dupefile_option,
    "-loud"      => \$loud_option,
);
for my $switch (sort keys %slot_for) {
    @input_matches = grep { $_ eq $switch } @input;
    ${ $slot_for{$switch} } = $input_matches[0];
}

# Tell the user when trailing-whitespace stripping has been switched off.
if (defined $tws_option && $tws_option eq "-ignoretws") {
    print STDERR "Ignoring trailing whitespace (seeking duplicates less agressively)";
}
# Read the input file into @file_array, one normalised line per element.
#
# * Three-argument open with an explicit '<' mode: the filename is always
#   treated as a plain path, never as a mode prefix or a pipe.
# * The line ending is removed with a regex, because chomp only removes "\n"
#   and would leave the "\r" of a Windows line ending in place.
# * Unless -ignoretws was given, all trailing whitespace is stripped as well,
#   so lines that differ only in trailing blanks compare as equal.
open(my $in_fh, '<', $filename)
    or die "\n\n Looks like you pointed me to a file that doesn't exist or is corrupt.\n ($filename: $!)\n\n";

# Decide once, outside the loop, whether trailing whitespace is kept.
my $keep_trailing_ws = defined $tws_option && $tws_option eq "-ignoretws";

while (defined($curr_line = <$in_fh>)) {
    $curr_line =~ s/[\r\n]+\z//;    # drop LF, CRLF or a stray trailing CR
    unless ($keep_trailing_ws) {
        $curr_line =~ s/\s+$//;     # With thanks to Perlmaven.com's Gabor Szabo (https://perlmaven.com/trim)
    }
    $file_array[$line_no] = $curr_line;
    $line_no++;
}
close($in_fh);

# Record how many lines were read, then reset the counter.
$last_array_entry = $line_no;
$line_no = 0;
############ debugging ############
# With -debug, report the state before de-duplication on STDERR: the lines
# that were read, the value of $#file_array (the index of the last element),
# and the keys currently held in %test_hash.
if ($debug_option eq "-debug") {
    my @report = (
        "OK, so here's the file array:",
        "\n_____________________________________\n",
        @file_array,
        "\n",
        "Trailing whitespaces and cr/lfs have been removed.",
        "Now it's time to eliminate those duplicates",
        "lenght of file array:",
        $#file_array,
        "test hash (shld be empty):",
        join(",", keys %test_hash),
        ".",
        "\n\n About to process file array...\n",
    );
    print STDERR @report;
}
################## /debugging ##############
# Removed trailing whitespace and cr /lf nonsense
# Now to remove duplicate lines!
######### Funny story - thanks to a programming error, I thought this code was faulty, #######
#### but it was my mistake. Fixed now ########
#
# Walk the lines in input order. The first time a line is seen it is appended
# to @unique_array and recorded in %test_hash; every later occurrence is a
# duplicate and is reported:
#   * no dupefile switch  -> "DUPE FOUND!" message on STDERR
#   * exactly "-dupefile" -> the line is appended to dupefile.txt
# ($loud_option is not consulted here.)
my $debugging      = defined $debug_option    && $debug_option    eq "-debug";
my $store_dupes    = defined $dupefile_option && $dupefile_option eq "-dupefile";
my $announce_dupes = !defined $dupefile_option || $dupefile_option !~ /dupefile/;
my $dupe_fh;    # opened on the first duplicate only, so no empty dupefile.txt is created

foreach my $curr_value (@file_array) {
    print STDERR "\n curr value:$curr_value." if $debugging;    # debugging

    if (!$test_hash{$curr_value}) {
        push @unique_array, $curr_value;
        $test_hash{$curr_value} = 1;
        next;
    }

    # From here on $curr_value is a duplicate.
    if ($announce_dupes) {
        print STDERR "\n DUPE FOUND! $curr_value.";
    }
    elsif ($store_dupes) {
        print STDERR "Storing dupes in dupefile...";
        if (!$dupe_fh) {
            open($dupe_fh, '>>', 'dupefile.txt')
                or die "Shit! Couldn't open dupefile! ($!)";
        }
        # print, not printf: the line is data and must never be interpreted as
        # a format string (a "%s" or "%d" inside it would be expanded).
        print {$dupe_fh} "$curr_value\n";
    }
}

# Buffered write errors only surface when the handle is closed, so check it.
if ($dupe_fh) {
    close($dupe_fh) or die "Couldn't finish writing dupefile.txt: $!";
}
### debugging
# With -debug, show the keys collected in %test_hash before the result is printed.
if ($debug_option eq "-debug") {
    print STDERR
        "\n\n Test hash:",
        join(",", keys %test_hash),
        "\n",
        "\n Unique array is now ready!\n";
}
### /debugging
# Credit for the de-duplication code is also due to Gabor Szabo
# https://perlmaven.com/unique-values-in-an-array-in-perl

# The result goes to STDOUT, lines separated by "\n".
# NOTE(review): no newline is written after the last line; the "\n" that
# follows is sent to STDERR, not STDOUT.
print STDOUT join("\n", @unique_array);
print STDERR
    "\n",
    "\n\n All done! Please Bitcoin me at: 1NYnGXRS4ZzNzmHu5Hsrqx169D7k7qBcYy \n\n";