add semicolon, change to uppercase (#448)

* add semicolon, change to uppercase * forgot to add colon * Smarter replacement via regex * no need to replace hyphen * comment about periods
smucclaw · Oct 2, 2023 · 99f5093 · 99f5093
1 parent 8600a5b
commit 99f5093
Showing 1 changed file with 26 additions and 8 deletions.
diff --git a/lib/haskell/natural4/src/LS/XPile/LogicalEnglish/IdVars.hs b/lib/haskell/natural4/src/LS/XPile/LogicalEnglish/IdVars.hs
@@ -1,5 +1,7 @@
 {-# OPTIONS_GHC -W #-}
 
+{-# LANGUAGE BlockArguments #-}
+{-# LANGUAGE QuasiQuotes #-}
 {-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE DuplicateRecordFields, RecordWildCards #-}
 {-# LANGUAGE OverloadedStrings #-}
@@ -16,6 +18,7 @@ import Data.Coerce (coerce)
 import Data.HashSet qualified as HS
 import Data.Sequences (fromStrict, toStrict)
 import Data.Text qualified as T
+import Text.Regex.PCRE.Heavy qualified as PCRE
 import Text.Replace (Replace (Replace), listToTrie, replaceWithTrie)
 
 import LS.XPile.LogicalEnglish.Types
@@ -88,13 +91,13 @@ a config file that is kept in sync with the downstream stuff
 (since have to do this kind of replacement in the converse direction when generating justification)
 -}
 replaceTxt :: T.Text -> T.Text
-replaceTxt = toStrict . replaceWithTrie replacements . fromStrict
+replaceTxt =
+  replacePeriod . toStrict . replaceWithTrie replacements . fromStrict
   where
     replacements =
       listToTrie
-        [ Replace "," " comma",
-          Replace "." " dot ",
-          Replace "%" " percent" 
+        [ Replace "," " COMMA",
+          Replace "%" " PERCENT"
           {- ^ it's cleaner not to put a space after `percent`
            because it's usually something like "100% blah blah" in the encoding
            So if you add a space after, you end up getting "100 percent  blah blah", which doesn't look as nice.
@@ -105,17 +108,32 @@ replaceTxt = toStrict . replaceWithTrie replacements . fromStrict
             ""
 
             >>> replaceTxt ("100.5 * 2" :: T.Text)
-            "100 dot 5 * 2"
+            "100.5 * 2"
 
             >>> replaceTxt "100% guarantee"
-            "100 percent guarantee"
+            "100 PERCENT guarantee"
 
             >>> replaceTxt "rocks, stones, and trees"
-            "rocks comma stones comma and trees"
+            "rocks COMMA stones COMMA and trees"
           -}
         ]
-
 
+    -- LE parses dots inside numbers fine (e.g. "clause 2.1 applies"), but a
+    -- dot used as a full stop (e.g. "The car is blue.") must become " PERIOD ".
+    -- Match only a dot that is not directly followed by a digit, so that
+    -- neighbouring words are never swallowed by the replacement.
+    replacePeriod =
+      PCRE.gsub
+        -- https://stackoverflow.com/a/45616898
+        [PCRE.re|\.(?!\d)|]
+        (" PERIOD " :: T.Text)
+
+    -- replaceHyphen =
+    --   PCRE.gsub
+    --     -- https://stackoverflow.com/a/31911114
+    --     [PCRE.re|(?=\S*[-])([a-zA-Z]+)\-([a-zA-Z]+)|]
+    --     \(s0:s1:_) -> mconcat [s0, " HYPHEN ", s1] :: T.Text
+
 {- | Convert a SimplifiedL4 Cell to a VCell
 The code for simplifying L4 AST has established these invariants:  
   * every IS NUM has had the IS removed, with the number converted to T.Text and wrapped in a MkCellIsNum