Merge pull request #23 from TwiN/censor
Implement censor function
TwiN authored Dec 1, 2021
2 parents c2f0cb8 + aeaecc8 commit 58c7739
Showing 5 changed files with 206 additions and 32 deletions.
36 changes: 22 additions & 14 deletions README.md
@@ -7,34 +7,42 @@
[![Go Reference](https://pkg.go.dev/badge/github.com/TwiN/go-away.svg)](https://pkg.go.dev/github.com/TwiN/go-away)
[![Follow TwiN](https://img.shields.io/github/followers/TwiN?label=Follow&style=social)](https://github.com/TwiN)

go-away is a stand-alone, lightweight library for detecting profanities in Go.
go-away is a stand-alone, lightweight library for detecting and censoring profanities in Go.

This library must remain **extremely** easy to use. Its original intent of adding no unnecessary overhead will always remain.


## Installation
```
```console
go get -u github.com/TwiN/go-away
```


## Usage
```go
package main

import (
"github.com/TwiN/go-away"
"github.com/TwiN/go-away"
)

goaway.IsProfane("fuck this shit") // returns true
goaway.ExtractProfanity("fuck this shit") // returns "fuck"

goaway.IsProfane("F u C k th1$ $h!t") // returns true
goaway.ExtractProfanity("F u C k th1$ $h!t") // returns "fuck"

goaway.IsProfane("@$$h073") // returns true
goaway.ExtractProfanity("@$$h073") // returns "asshole"

goaway.IsProfane("hello, world!") // returns false
goaway.ExtractProfanity("hello, world!") // returns ""
func main() {
goaway.IsProfane("fuck this shit") // returns true
goaway.ExtractProfanity("fuck this shit") // returns "fuck"
goaway.Censor("fuck this shit") // returns "**** this ****"

goaway.IsProfane("F u C k th1$ $h!t") // returns true
goaway.ExtractProfanity("F u C k th1$ $h!t") // returns "fuck"
goaway.Censor("F u C k th1$ $h!t") // returns "* * * * th1$ ****"

goaway.IsProfane("@$$h073") // returns true
goaway.ExtractProfanity("@$$h073") // returns "asshole"
goaway.Censor("@$$h073") // returns "*******"

goaway.IsProfane("hello, world!") // returns false
goaway.ExtractProfanity("hello, world!") // returns ""
goaway.Censor("hello, world!") // returns "hello, world!"
}
```

Calling `goaway.IsProfane` and `goaway.ExtractProfanity` will use the default profanity detector, but if you'd like to
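For example, here is a minimal sketch of using a dedicated detector instead of the package-level helpers. It relies only on builder methods exercised by the tests in this diff (`NewProfanityDetector`, `WithSanitizeLeetSpeak`, `WithSanitizeSpecialCharacters`); treat it as illustrative rather than exhaustive:

```go
package main

import (
	"fmt"

	goaway "github.com/TwiN/go-away"
)

func main() {
	// A detector with leet-speak sanitization turned off, so substitutions
	// like "$h!t" are no longer normalized to "shit" before matching.
	detector := goaway.NewProfanityDetector().WithSanitizeLeetSpeak(false)

	fmt.Println(detector.IsProfane("fuck this shit")) // true
	fmt.Println(detector.Censor("fuck this shit"))    // **** this ****
}
```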
1 change: 1 addition & 0 deletions falsepositives.go
@@ -20,6 +20,7 @@ var DefaultFalsePositives = []string{
"circum",
"clitheroe",
"cockburn",
"cocktail",
"cumber",
"cumbing",
"cumulat",
107 changes: 94 additions & 13 deletions goaway.go
@@ -91,7 +91,7 @@ func (g *ProfanityDetector) IsProfane(s string) bool {
// ExtractProfanity takes in a string (word or sentence) and looks for profanities.
// Returns the first profanity found, or an empty string if none are found.
func (g *ProfanityDetector) ExtractProfanity(s string) string {
s = g.sanitize(s)
s, _ = g.sanitize(s, false)
// Check for false negatives
for _, word := range g.falseNegatives {
if match := strings.Contains(s, word); match {
@@ -111,7 +111,56 @@ func (g *ProfanityDetector) ExtractProfanity(s string) string {
return ""
}

func (g ProfanityDetector) sanitize(s string) string {
// Censor takes in a string (word or sentence) and tries to censor all profanities found.
func (g *ProfanityDetector) Censor(s string) string {
censored := s
var originalIndexes []int
s, originalIndexes = g.sanitize(s, true)
// Check for false negatives
for _, word := range g.falseNegatives {
currentIndex := 0
for currentIndex != -1 {
if foundIndex := strings.Index(s[currentIndex:], word); foundIndex != -1 {
for i := 0; i < len(word); i++ {
censored = censored[:originalIndexes[foundIndex+currentIndex+i]] + "*" + censored[originalIndexes[foundIndex+currentIndex+i]+1:]
}
currentIndex += foundIndex + len(word)
} else {
break
}
}
}
// Remove false positives
for _, word := range g.falsePositives {
currentIndex := 0
for currentIndex != -1 {
if foundIndex := strings.Index(s[currentIndex:], word); foundIndex != -1 {
originalIndexes = append(originalIndexes[:foundIndex+currentIndex], originalIndexes[foundIndex+len(word):]...)
currentIndex += foundIndex + len(word)
} else {
break
}
}
s = strings.Replace(s, word, "", -1)
}
// Check for profanities
for _, word := range g.profanities {
currentIndex := 0
for currentIndex != -1 {
if foundIndex := strings.Index(s[currentIndex:], word); foundIndex != -1 {
for i := 0; i < len(word); i++ {
censored = censored[:originalIndexes[foundIndex+currentIndex+i]] + "*" + censored[originalIndexes[foundIndex+currentIndex+i]+1:]
}
currentIndex += foundIndex + len(word)
} else {
break
}
}
}
return censored
}

func (g ProfanityDetector) sanitize(s string, rememberOriginalIndexes bool) (string, []int) {
s = strings.ToLower(s)
if g.sanitizeLeetSpeak {
s = strings.Replace(s, "0", "o", -1)
@@ -129,22 +178,44 @@ func (g ProfanityDetector) sanitize(s string) string {
s = strings.Replace(s, "+", "t", -1)
s = strings.Replace(s, "$", "s", -1)
s = strings.Replace(s, "#", "h", -1)
s = strings.Replace(s, "()", "o", -1)
s = strings.Replace(s, "!", "i", -1)
if !rememberOriginalIndexes {
// Censor, which is the only function that sets rememberOriginalIndexes to true,
// does not support sanitizing '()' into 'o', because converting two characters
// into a single character throws off the original character indexes. Unfortunately,
// I'm too sleepy to figure out how to fix it right now.
s = strings.Replace(s, "()", "o", -1)
}
} else {
s = strings.Replace(s, "@", " ", -1)
s = strings.Replace(s, "+", " ", -1)
s = strings.Replace(s, "$", " ", -1)
s = strings.Replace(s, "#", " ", -1)
s = strings.Replace(s, "(", " ", -1)
s = strings.Replace(s, ")", " ", -1)
s = strings.Replace(s, "!", " ", -1)
}
s = strings.Replace(s, "_", "", -1)
s = strings.Replace(s, "-", "", -1)
s = strings.Replace(s, "*", "", -1)
s = strings.Replace(s, "'", "", -1)
s = strings.Replace(s, "?", "", -1)
s = strings.Replace(s, "!", "", -1)
}
if g.sanitizeSpaces {
s = strings.Replace(s, space, "", -1)
s = strings.Replace(s, "_", " ", -1)
s = strings.Replace(s, "-", " ", -1)
s = strings.Replace(s, "*", " ", -1)
s = strings.Replace(s, "'", " ", -1)
s = strings.Replace(s, "?", " ", -1)
}
if g.sanitizeAccents {
s = removeAccents(s)
}
return s
var originalIndexes []int
if rememberOriginalIndexes {
for i, c := range s {
if c != ' ' {
originalIndexes = append(originalIndexes, i)
}
}
}
if g.sanitizeSpaces {
s = strings.Replace(s, space, "", -1)
}
return s, originalIndexes
}

// removeAccents strips all accents from characters.
@@ -183,3 +254,13 @@ func ExtractProfanity(s string) string {
}
return defaultProfanityDetector.ExtractProfanity(s)
}

// Censor takes in a string (word or sentence) and tries to censor all profanities found.
//
// Uses the default ProfanityDetector
func Censor(s string) string {
if defaultProfanityDetector == nil {
defaultProfanityDetector = NewProfanityDetector()
}
return defaultProfanityDetector.Censor(s)
}
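The mechanism behind `Censor` in this commit is an index map: when `rememberOriginalIndexes` is true, `sanitize` records the original positions of the characters that survive sanitization, and `Censor` then matches profanities against the sanitized text and stars out those remembered positions in the unmodified input, which is why casing, punctuation, and spacing outside the matched word are preserved. Below is a self-contained sketch of the same idea, simplified to lowercasing and space removal only (the real detector also handles leet speak, special characters, accents, and false positives/negatives):

```go
package main

import (
	"fmt"
	"strings"
)

// censorSketch mirrors the index-mapping idea from this commit: it matches
// profanities against a normalized copy of the input, but writes the '*'
// characters into the original string at the remembered positions.
func censorSketch(original string, profanities []string) string {
	censored := []byte(original)
	var sanitized []byte
	var indexes []int // indexes[j] = position in `original` of sanitized[j]
	for i := 0; i < len(original); i++ {
		c := original[i]
		if c == ' ' {
			continue // drop spaces, like sanitizeSpaces does
		}
		if c >= 'A' && c <= 'Z' {
			c += 'a' - 'A' // lowercase
		}
		sanitized = append(sanitized, c)
		indexes = append(indexes, i)
	}
	s := string(sanitized)
	for _, word := range profanities {
		for start := 0; ; {
			found := strings.Index(s[start:], word)
			if found == -1 {
				break
			}
			// Star out the matched characters at their original positions.
			for i := 0; i < len(word); i++ {
				censored[indexes[start+found+i]] = '*'
			}
			start += found + len(word)
		}
	}
	return string(censored)
}

func main() {
	fmt.Println(censorSketch("F u C k this", []string{"fuck"})) // * * * * this
}
```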
9 changes: 8 additions & 1 deletion goaway_bench_test.go
@@ -118,4 +118,11 @@ func BenchmarkProfanityDetector_Sanitize(b *testing.B) {
profanityDetector.IsProfane("H3ll0 J0hn D0e, 1 h0p3 y0u'r3 f3eling w3ll, as 1 c0me t0d4y b34r1ng sh1tty n3w5 r3g4rd1ng y0ur fav0rite ch0c0l4t3 chip c00kie br4nd")
}
b.ReportAllocs()
}
}

func BenchmarkCensor(b *testing.B) {
for n := 0; n < b.N; n++ {
Censor("Thundercunt c()ck")
}
b.ReportAllocs()
}
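The added benchmark runs with the standard Go tooling, for example:

```console
go test -bench=BenchmarkCensor -benchmem
```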
85 changes: 81 additions & 4 deletions goaway_test.go
@@ -37,6 +37,83 @@ func TestExtractProfanity(t *testing.T) {
}
}

func TestProfanityDetector_Censor(t *testing.T) {
defaultProfanityDetector = nil
tests := []struct {
input string
expectedCensoredOutput string
}{
{
input: "what the fuck",
expectedCensoredOutput: "what the ****",
},
{
input: "fuck this",
expectedCensoredOutput: "**** this",
},
{
input: "one penis, two vaginas, three dicks, four sluts, five whores and a flower",
expectedCensoredOutput: "one *****, two ******s, three ****s, four ****s, five *****s and a flower",
},
{
input: "Censor doesn't support sanitizing '()' into 'o', because it's two characters. Proof: c()ck. Maybe one day I'll have time to fix it.",
expectedCensoredOutput: "Censor doesn't support sanitizing '()' into 'o', because it's two characters. Proof: c()ck. Maybe one day I'll have time to fix it.",
},
{
input: "fuck shit fuck",
expectedCensoredOutput: "**** **** ****",
},
{
input: "fuckfuck",
expectedCensoredOutput: "********",
},
{
input: "fuck this shit",
expectedCensoredOutput: "**** this ****",
},
{
input: "F u C k th1$ $h!t",
expectedCensoredOutput: "* * * * th1$ ****",
},
{
input: "@$$h073",
expectedCensoredOutput: "*******",
},
{
input: "hello, world!",
expectedCensoredOutput: "hello, world!",
},
{
input: "Hey asshole, are y()u an assassin? If not, fuck off.",
expectedCensoredOutput: "Hey *******, are y()u an assassin? If not, **** off.",
},
{
input: "I am from Scunthorpe, north Lincolnshire",
expectedCensoredOutput: "I am from Scunthorpe, north Lincolnshire",
},
{
input: "He is an associate of mine",
expectedCensoredOutput: "He is an associate of mine",
},
{
input: "But the table is on fucking fire",
expectedCensoredOutput: "But the table is on ****ing fire",
},
{
input: "glass",
expectedCensoredOutput: "glass",
},
}
for _, tt := range tests {
t.Run(tt.input, func(t *testing.T) {
censored := Censor(tt.input)
if censored != tt.expectedCensoredOutput {
t.Errorf("expected '%s', got '%s'", tt.expectedCensoredOutput, censored)
}
})
}
}

func TestNoDuplicatesBetweenProfanitiesAndFalseNegatives(t *testing.T) {
for _, profanity := range DefaultProfanities {
for _, falseNegative := range DefaultFalseNegatives {
@@ -413,23 +490,23 @@ func TestSentencesFromTheAdventuresOfSherlockHolmes(t *testing.T) {

func TestSanitize(t *testing.T) {
expectedString := "whatthefuckisyourproblem"
sanitizedString := NewProfanityDetector().sanitize("What the fu_ck is y()ur pr0bl3m?")
sanitizedString, _ := NewProfanityDetector().sanitize("What the fu_ck is y()ur pr0bl3m?", false)
if sanitizedString != expectedString {
t.Errorf("Expected '%s', got '%s'", expectedString, sanitizedString)
}
}

func TestSanitizeWithoutSanitizingSpecialCharacters(t *testing.T) {
expectedString := "whatthefu_ckisy()urproblem?"
sanitizedString := NewProfanityDetector().WithSanitizeSpecialCharacters(false).sanitize("What the fu_ck is y()ur pr0bl3m?")
sanitizedString, _ := NewProfanityDetector().WithSanitizeSpecialCharacters(false).sanitize("What the fu_ck is y()ur pr0bl3m?", false)
if sanitizedString != expectedString {
t.Errorf("Expected '%s', got '%s'", expectedString, sanitizedString)
}
}

func TestSanitizeWithoutSanitizingLeetSpeak(t *testing.T) {
expectedString := "whatthefuckisy()urpr0bl3m"
sanitizedString := NewProfanityDetector().WithSanitizeLeetSpeak(false).sanitize("What the fu_ck is y()ur pr0bl3m?")
expectedString := "whatthefuckisyurpr0bl3m"
sanitizedString, _ := NewProfanityDetector().WithSanitizeLeetSpeak(false).sanitize("What the fu_ck is y()ur pr0bl3m?", false)
if sanitizedString != expectedString {
t.Errorf("Expected '%s', got '%s'", expectedString, sanitizedString)
}
