Merge pull request #23 from TwiN/censor
Implement censor function
TwiN authored Dec 1, 2021
2 parents c2f0cb8 + aeaecc8 commit 58c7739
Showing 5 changed files with 206 additions and 32 deletions.
36 changes: 22 additions & 14 deletions README.md
@@ -7,34 +7,42 @@
[![Go Reference](https://pkg.go.dev/badge/github.com/TwiN/go-away.svg)](https://pkg.go.dev/github.com/TwiN/go-away)
[![Follow TwiN](https://img.shields.io/github/followers/TwiN?label=Follow&style=social)](https://github.com/TwiN)

go-away is a stand-alone, lightweight library for detecting profanities in Go.
go-away is a stand-alone, lightweight library for detecting and censoring profanities in Go.

This library must remain **extremely** easy to use. Its original intent of adding no unnecessary overhead will always remain.


## Installation
```
```console
go get -u github.com/TwiN/go-away
```


## Usage
```go
package main

import (
"github.com/TwiN/go-away"
"github.com/TwiN/go-away"
)

goaway.IsProfane("fuck this shit") // returns true
goaway.ExtractProfanity("fuck this shit") // returns "fuck"

goaway.IsProfane("F u C k th1$ $h!t") // returns true
goaway.ExtractProfanity("F u C k th1$ $h!t") // returns "fuck"

goaway.IsProfane("@$$h073") // returns true
goaway.ExtractProfanity("@$$h073") // returns "asshole"

goaway.IsProfane("hello, world!") // returns false
goaway.ExtractProfanity("hello, world!") // returns ""
func main() {
goaway.IsProfane("fuck this shit") // returns true
goaway.ExtractProfanity("fuck this shit") // returns "fuck"
goaway.Censor("fuck this shit") // returns "**** this ****"

goaway.IsProfane("F u C k th1$ $h!t") // returns true
goaway.ExtractProfanity("F u C k th1$ $h!t") // returns "fuck"
goaway.Censor("F u C k th1$ $h!t") // returns "* * * * th1$ ****"

goaway.IsProfane("@$$h073") // returns true
goaway.ExtractProfanity("@$$h073") // returns "asshole"
goaway.Censor("@$$h073") // returns "*******"

goaway.IsProfane("hello, world!") // returns false
goaway.ExtractProfanity("hello, world!") // returns ""
goaway.Censor("hello, world!") // returns "hello, world!"
}
```

Calling `goaway.IsProfane` and `goaway.ExtractProfanity` will use the default profanity detector, but if you'd like to
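For example, here is a minimal sketch of using a dedicated detector instead of the package-level helpers. It relies only on builder methods exercised by the tests in this diff (`NewProfanityDetector`, `WithSanitizeLeetSpeak`, `WithSanitizeSpecialCharacters`); treat it as illustrative rather than exhaustive:

```go
package main

import (
	"fmt"

	goaway "github.com/TwiN/go-away"
)

func main() {
	// A detector with leet-speak sanitization turned off, so substitutions
	// like "$h!t" are no longer normalized to "shit" before matching.
	detector := goaway.NewProfanityDetector().WithSanitizeLeetSpeak(false)

	fmt.Println(detector.IsProfane("fuck this shit")) // true
	fmt.Println(detector.Censor("fuck this shit"))    // **** this ****
}
```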
1 change: 1 addition & 0 deletions falsepositives.go
@@ -20,6 +20,7 @@ var DefaultFalsePositives = []string{
"circum",
"clitheroe",
"cockburn",
"cocktail",
"cumber",
"cumbing",
"cumulat",
107 changes: 94 additions & 13 deletions goaway.go
@@ -91,7 +91,7 @@ func (g *ProfanityDetector) IsProfane(s string) bool {
// ExtractProfanity takes in a string (word or sentence) and looks for profanities.
// Returns the first profanity found, or an empty string if none are found.
func (g *ProfanityDetector) ExtractProfanity(s string) string {
s = g.sanitize(s)
s, _ = g.sanitize(s, false)
// Check for false negatives
for _, word := range g.falseNegatives {
if match := strings.Contains(s, word); match {
@@ -111,7 +111,56 @@ func (g *ProfanityDetector) ExtractProfanity(s string) string {
return ""
}

func (g ProfanityDetector) sanitize(s string) string {
// Censor takes in a string (word or sentence) and tries to censor all profanities found.
func (g *ProfanityDetector) Censor(s string) string {
censored := s
var originalIndexes []int
s, originalIndexes = g.sanitize(s, true)
// Check for false negatives
for _, word := range g.falseNegatives {
currentIndex := 0
for currentIndex != -1 {
if foundIndex := strings.Index(s[currentIndex:], word); foundIndex != -1 {
for i := 0; i < len(word); i++ {
censored = censored[:originalIndexes[foundIndex+currentIndex+i]] + "*" + censored[originalIndexes[foundIndex+currentIndex+i]+1:]
}
currentIndex += foundIndex + len(word)
} else {
break
}
}
}
// Remove false positives
for _, word := range g.falsePositives {
currentIndex := 0
for currentIndex != -1 {
if foundIndex := strings.Index(s[currentIndex:], word); foundIndex != -1 {
originalIndexes = append(originalIndexes[:foundIndex+currentIndex], originalIndexes[foundIndex+len(word):]...)
currentIndex += foundIndex + len(word)
} else {
break
}
}
s = strings.Replace(s, word, "", -1)
}
// Check for profanities
for _, word := range g.profanities {
currentIndex := 0
for currentIndex != -1 {
if foundIndex := strings.Index(s[currentIndex:], word); foundIndex != -1 {
for i := 0; i < len(word); i++ {
censored = censored[:originalIndexes[foundIndex+currentIndex+i]] + "*" + censored[originalIndexes[foundIndex+currentIndex+i]+1:]
}
currentIndex += foundIndex + len(word)
} else {
break
}
}
}
return censored
}

func (g ProfanityDetector) sanitize(s string, rememberOriginalIndexes bool) (string, []int) {
s = strings.ToLower(s)
if g.sanitizeLeetSpeak {
s = strings.Replace(s, "0", "o", -1)
@@ -129,22 +178,44 @@ func (g ProfanityDetector) sanitize(s string) string {
s = strings.Replace(s, "+", "t", -1)
s = strings.Replace(s, "$", "s", -1)
s = strings.Replace(s, "#", "h", -1)
s = strings.Replace(s, "()", "o", -1)
s = strings.Replace(s, "!", "i", -1)
if !rememberOriginalIndexes {
// Censor, which is the only function that sets rememberOriginalIndexes to true,
// does not support sanitizing '()' into 'o', because converting two characters
// into a single character throws off the original character indexes. Unfortunately,
// I'm too sleepy to figure out how to fix it right now.
s = strings.Replace(s, "()", "o", -1)
}
} else {
s = strings.Replace(s, "@", " ", -1)
s = strings.Replace(s, "+", " ", -1)
s = strings.Replace(s, "$", " ", -1)
s = strings.Replace(s, "#", " ", -1)
s = strings.Replace(s, "(", " ", -1)
s = strings.Replace(s, ")", " ", -1)
s = strings.Replace(s, "!", " ", -1)
}
s = strings.Replace(s, "_", "", -1)
s = strings.Replace(s, "-", "", -1)
s = strings.Replace(s, "*", "", -1)
s = strings.Replace(s, "'", "", -1)
s = strings.Replace(s, "?", "", -1)
s = strings.Replace(s, "!", "", -1)
}
if g.sanitizeSpaces {
s = strings.Replace(s, space, "", -1)
s = strings.Replace(s, "_", " ", -1)
s = strings.Replace(s, "-", " ", -1)
s = strings.Replace(s, "*", " ", -1)
s = strings.Replace(s, "'", " ", -1)
s = strings.Replace(s, "?", " ", -1)
}
if g.sanitizeAccents {
s = removeAccents(s)
}
return s
var originalIndexes []int
if rememberOriginalIndexes {
for i, c := range s {
if c != ' ' {
originalIndexes = append(originalIndexes, i)
}
}
}
if g.sanitizeSpaces {
s = strings.Replace(s, space, "", -1)
}
return s, originalIndexes
}

// removeAccents strips all accents from characters.
@@ -183,3 +254,13 @@ func ExtractProfanity(s string) string {
}
return defaultProfanityDetector.ExtractProfanity(s)
}

// Censor takes in a string (word or sentence) and tries to censor all profanities found.
//
// Uses the default ProfanityDetector
func Censor(s string) string {
if defaultProfanityDetector == nil {
defaultProfanityDetector = NewProfanityDetector()
}
return defaultProfanityDetector.Censor(s)
}
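The mechanism behind `Censor` in this commit is an index map: when `rememberOriginalIndexes` is true, `sanitize` records the original positions of the characters that survive sanitization, and `Censor` then matches profanities against the sanitized text and stars out those remembered positions in the unmodified input, which is why casing, punctuation, and spacing outside the matched word are preserved. Below is a self-contained sketch of the same idea, simplified to lowercasing and space removal only (the real detector also handles leet speak, special characters, accents, and false positives/negatives):

```go
package main

import (
	"fmt"
	"strings"
)

// censorSketch mirrors the index-mapping idea from this commit: it matches
// profanities against a normalized copy of the input, but writes the '*'
// characters into the original string at the remembered positions.
func censorSketch(original string, profanities []string) string {
	censored := []byte(original)
	var sanitized []byte
	var indexes []int // indexes[j] = position in `original` of sanitized[j]
	for i := 0; i < len(original); i++ {
		c := original[i]
		if c == ' ' {
			continue // drop spaces, like sanitizeSpaces does
		}
		if c >= 'A' && c <= 'Z' {
			c += 'a' - 'A' // lowercase
		}
		sanitized = append(sanitized, c)
		indexes = append(indexes, i)
	}
	s := string(sanitized)
	for _, word := range profanities {
		for start := 0; ; {
			found := strings.Index(s[start:], word)
			if found == -1 {
				break
			}
			// Star out the matched characters at their original positions.
			for i := 0; i < len(word); i++ {
				censored[indexes[start+found+i]] = '*'
			}
			start += found + len(word)
		}
	}
	return string(censored)
}

func main() {
	fmt.Println(censorSketch("F u C k this", []string{"fuck"})) // * * * * this
}
```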
9 changes: 8 additions & 1 deletion goaway_bench_test.go
@@ -118,4 +118,11 @@ func BenchmarkProfanityDetector_Sanitize(b *testing.B) {
profanityDetector.IsProfane("H3ll0 J0hn D0e, 1 h0p3 y0u'r3 f3eling w3ll, as 1 c0me t0d4y b34r1ng sh1tty n3w5 r3g4rd1ng y0ur fav0rite ch0c0l4t3 chip c00kie br4nd")
}
b.ReportAllocs()
}
}

func BenchmarkCensor(b *testing.B) {
for n := 0; n < b.N; n++ {
Censor("Thundercunt c()ck")
}
b.ReportAllocs()
}
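The added benchmark runs with the standard Go tooling, for example:

```console
go test -bench=BenchmarkCensor -benchmem
```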
85 changes: 81 additions & 4 deletions goaway_test.go
@@ -37,6 +37,83 @@ func TestExtractProfanity(t *testing.T) {
}
}

func TestProfanityDetector_Censor(t *testing.T) {
defaultProfanityDetector = nil
tests := []struct {
input string
expectedCensoredOutput string
}{
{
input: "what the fuck",
expectedCensoredOutput: "what the ****",
},
{
input: "fuck this",
expectedCensoredOutput: "**** this",
},
{
input: "one penis, two vaginas, three dicks, four sluts, five whores and a flower",
expectedCensoredOutput: "one *****, two ******s, three ****s, four ****s, five *****s and a flower",
},
{
input: "Censor doesn't support sanitizing '()' into 'o', because it's two characters. Proof: c()ck. Maybe one day I'll have time to fix it.",
expectedCensoredOutput: "Censor doesn't support sanitizing '()' into 'o', because it's two characters. Proof: c()ck. Maybe one day I'll have time to fix it.",
},
{
input: "fuck shit fuck",
expectedCensoredOutput: "**** **** ****",
},
{
input: "fuckfuck",
expectedCensoredOutput: "********",
},
{
input: "fuck this shit",
expectedCensoredOutput: "**** this ****",
},
{
input: "F u C k th1$ $h!t",
expectedCensoredOutput: "* * * * th1$ ****",
},
{
input: "@$$h073",
expectedCensoredOutput: "*******",
},
{
input: "hello, world!",
expectedCensoredOutput: "hello, world!",
},
{
input: "Hey asshole, are y()u an assassin? If not, fuck off.",
expectedCensoredOutput: "Hey *******, are y()u an assassin? If not, **** off.",
},
{
input: "I am from Scunthorpe, north Lincolnshire",
expectedCensoredOutput: "I am from Scunthorpe, north Lincolnshire",
},
{
input: "He is an associate of mine",
expectedCensoredOutput: "He is an associate of mine",
},
{
input: "But the table is on fucking fire",
expectedCensoredOutput: "But the table is on ****ing fire",
},
{
input: "glass",
expectedCensoredOutput: "glass",
},
}
for _, tt := range tests {
t.Run(tt.input, func(t *testing.T) {
censored := Censor(tt.input)
if censored != tt.expectedCensoredOutput {
t.Errorf("expected '%s', got '%s'", tt.expectedCensoredOutput, censored)
}
})
}
}

func TestNoDuplicatesBetweenProfanitiesAndFalseNegatives(t *testing.T) {
for _, profanity := range DefaultProfanities {
for _, falseNegative := range DefaultFalseNegatives {
@@ -413,23 +490,23 @@ func TestSentencesFromTheAdventuresOfSherlockHolmes(t *testing.T) {

func TestSanitize(t *testing.T) {
expectedString := "whatthefuckisyourproblem"
sanitizedString := NewProfanityDetector().sanitize("What the fu_ck is y()ur pr0bl3m?")
sanitizedString, _ := NewProfanityDetector().sanitize("What the fu_ck is y()ur pr0bl3m?", false)
if sanitizedString != expectedString {
t.Errorf("Expected '%s', got '%s'", expectedString, sanitizedString)
}
}

func TestSanitizeWithoutSanitizingSpecialCharacters(t *testing.T) {
expectedString := "whatthefu_ckisy()urproblem?"
sanitizedString := NewProfanityDetector().WithSanitizeSpecialCharacters(false).sanitize("What the fu_ck is y()ur pr0bl3m?")
sanitizedString, _ := NewProfanityDetector().WithSanitizeSpecialCharacters(false).sanitize("What the fu_ck is y()ur pr0bl3m?", false)
if sanitizedString != expectedString {
t.Errorf("Expected '%s', got '%s'", expectedString, sanitizedString)
}
}

func TestSanitizeWithoutSanitizingLeetSpeak(t *testing.T) {
expectedString := "whatthefuckisy()urpr0bl3m"
sanitizedString := NewProfanityDetector().WithSanitizeLeetSpeak(false).sanitize("What the fu_ck is y()ur pr0bl3m?")
expectedString := "whatthefuckisyurpr0bl3m"
sanitizedString, _ := NewProfanityDetector().WithSanitizeLeetSpeak(false).sanitize("What the fu_ck is y()ur pr0bl3m?", false)
if sanitizedString != expectedString {
t.Errorf("Expected '%s', got '%s'", expectedString, sanitizedString)
}
