bluesky-social · quiple · Dec 18, 2024
diff --git a/automod/rules/keyword.go b/automod/rules/keyword.go
@@ -12,16 +12,10 @@ import (
 )
 
 func BadWordPostRule(c *automod.RecordContext, post *appbsky.FeedPost) error {
-	isJapanese := false
-	for _, lang := range post.Langs {
-		if lang == "ja" || strings.HasPrefix(lang, "ja-") {
-			isJapanese = true
-		}
-	}
 	for _, tok := range helpers.ExtractTextTokensPost(post) {
 		word := keyword.SlugIsExplicitSlur(tok)
 		// used very frequently in a reclaimed context
-		if word != "" && word != "faggot" && word != "tranny" && word != "coon" && !(word == "kike" && isJapanese) {
+		if word != "" && word != "faggot" && word != "tranny" && word != "coon" {
 			c.AddRecordFlag("bad-word-text")
 			c.ReportRecord(automod.ReportReasonRude, fmt.Sprintf("possible bad word in post text or alttext: %s", word))
 			//c.Notify("slack")
@@ -30,11 +24,6 @@ func BadWordPostRule(c *automod.RecordContext, post *appbsky.FeedPost) error {
 		// de-pluralize
 		tok = strings.TrimSuffix(tok, "s")
 		if c.InSet("worst-words", tok) {
-			// skip this specific term, if used in a Japanese language post
-			if isJapanese && tok == "kike" {
-				continue
-			}
-
 			c.AddRecordFlag("bad-word-text")
 			c.ReportRecord(automod.ReportReasonRude, fmt.Sprintf("possible bad word in post text or alttext: %s", tok))
 			//c.Notify("slack")

diff --git a/cmd/palomar/Dockerfile.opensearch b/cmd/palomar/Dockerfile.opensearch
@@ -1,3 +1,2 @@
 FROM opensearchproject/opensearch:2.13.0
 RUN /usr/share/opensearch/bin/opensearch-plugin install --batch analysis-icu
-RUN /usr/share/opensearch/bin/opensearch-plugin install --batch analysis-kuromoji
diff --git a/cmd/palomar/README.md b/cmd/palomar/README.md
@@ -64,7 +64,7 @@ Response:
 
 ## Development Quickstart
 
-Run an ephemeral opensearch instance on local port 9200, with SSL disabled, and the `analysis-icu` and `analysis-kuromoji` plugins installed, using docker:
+Run an ephemeral opensearch instance on local port 9200, with SSL disabled, and the `analysis-icu` plugin installed, using docker:
 
     docker build -f Dockerfile.opensearch . -t opensearch-palomar
 

diff --git a/cmd/palomar/README.opensearch.md b/cmd/palomar/README.opensearch.md
@@ -1,16 +1,14 @@
 
 # Basic OpenSearch Operations
 
-We use OpenSearch version 2.13+, with the `analysis-icu` and `analysis-kuromoji` plugins. These are included automatically on the AWS hosted version of Opensearch, otherwise you need to install:
+We use OpenSearch version 2.13+, with the `analysis-icu` plugin. It is included automatically on the AWS hosted version of OpenSearch; otherwise you need to install it:
 
     sudo /usr/share/opensearch/bin/opensearch-plugin install analysis-icu
-    sudo /usr/share/opensearch/bin/opensearch-plugin install analysis-kuromoji
     sudo service opensearch restart
 
 If you are trying to use Elasticsearch 7.10 instead of OpenSearch, you can install the plugin with:
 
     sudo /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-icu
-    sudo /usr/share/elasticsearch/bin/elasticsearch-plugin install analysis-kuromoji
     sudo service elasticsearch restart
 
 ## Local Development

diff --git a/search/japanese.go b/search/japanese.go
diff --git a/search/japanese_test.go b/search/japanese_test.go
diff --git a/search/post_schema.json b/search/post_schema.json
@@ -15,38 +15,20 @@
                     "type": "custom",
                     "tokenizer": "icu_tokenizer",
                     "char_filter": [ "icu_normalizer" ],
-                    "filter": [ "icu_folding" ]
-                },
-                "textIcuSearch": {
-                    "type": "custom",
-                    "tokenizer": "icu_tokenizer",
-                    "char_filter": [ "icu_normalizer" ],
-                    "filter": [ "icu_folding" ]
-                },
-                "textJapanese": {
-                    "type": "custom",
-                    "tokenizer": "kuromoji_tokenizer",
-                    "char_filter": [ "icu_normalizer" ],
                     "filter": [
-                        "kuromoji_baseform",
-                        "kuromoji_part_of_speech",
+                        "icu_folding",
                         "cjk_width",
-                        "ja_stop",
-                        "kuromoji_stemmer",
-                        "lowercase"
+                        "cjk_bigram"
                     ]
                 },
-                "textJapaneseSearch": {
+                "textIcuSearch": {
                     "type": "custom",
-                    "tokenizer": "kuromoji_tokenizer",
+                    "tokenizer": "icu_tokenizer",
                     "char_filter": [ "icu_normalizer" ],
                     "filter": [
-                        "kuromoji_baseform",
-                        "kuromoji_part_of_speech",
+                        "icu_folding",
                         "cjk_width",
-                        "ja_stop",
-                        "kuromoji_stemmer",
-                        "lowercase"
+                        "cjk_bigram"
                     ]
                 }
             },
@@ -75,15 +57,13 @@
 
         "created_at":     { "type": "date" },
         "text":           { "type": "text", "analyzer": "textIcu", "search_analyzer": "textIcuSearch", "copy_to": "everything" },
-        "text_ja":        { "type": "text", "analyzer": "textJapanese", "search_analyzer": "textJapaneseSearch", "copy_to": "everything_ja" },
         "lang_code":      { "type": "keyword", "normalizer": "default" },
         "lang_code_iso2": { "type": "keyword", "normalizer": "default" },
         "mention_did":    { "type": "keyword", "normalizer": "default" },
         "embed_aturi":    { "type": "keyword", "normalizer": "default" },
         "reply_root_aturi": { "type": "keyword", "normalizer": "default" },
         "embed_img_count": { "type": "integer" },
         "embed_img_alt_text": { "type": "text", "analyzer": "textIcu", "search_analyzer": "textIcuSearch", "copy_to": "everything" },
-        "embed_img_alt_text_ja": { "type": "text", "analyzer": "textJapanese", "search_analyzer": "textJapaneseSearch", "copy_to": "everything_ja" },
         "self_label":     { "type": "keyword", "normalizer": "default" },
 
         "url":            { "type": "keyword", "normalizer": "default" },
@@ -94,7 +74,6 @@
         "likesFuzzy":     { "type": "integer" },
 
         "everything":     { "type": "text", "analyzer": "textIcu", "search_analyzer": "textIcuSearch" },
-        "everything_ja":  { "type": "text", "analyzer": "textJapanese", "search_analyzer": "textJapaneseSearch" },
 
         "lang":           { "type": "alias", "path": "lang_code_iso2" }
     }

diff --git a/search/query.go b/search/query.go
@@ -224,9 +224,6 @@ func DoSearchPosts(ctx context.Context, dir identity.Directory, escli *es.Client
 	queryStringParams := ParsePostQuery(ctx, dir, params.Query, params.Viewer)
 	params.Update(&queryStringParams)
 	idx := "everything"
-	if containsJapanese(params.Query) {
-		idx = "everything_ja"
-	}
 	basic := map[string]interface{}{
 		"simple_query_string": map[string]interface{}{
 			"query":            params.Query,

diff --git a/search/query_test.go b/search/query_test.go
@@ -87,120 +87,6 @@ func testServer(ctx context.Context, t *testing.T, escli *es.Client, dir identit
 	return srv
 }
 
-func TestJapaneseRegressions(t *testing.T) {
-	assert := assert.New(t)
-	ctx := context.Background()
-	escli := testEsClient(t)
-	dir := identity.NewMockDirectory()
-	srv := testServer(ctx, t, escli, &dir)
-	ident := identity.Identity{
-		DID:    syntax.DID("did:plc:abc111"),
-		Handle: syntax.Handle("handle.example.com"),
-	}
-
-	res, err := DoSearchPosts(ctx, &dir, escli, testPostIndex, "english", 0, 20)
-	if err != nil {
-		t.Fatal(err)
-	}
-	assert.Equal(0, len(res.Hits.Hits))
-
-	p1 := appbsky.FeedPost{Text: "basic english post", CreatedAt: "2024-01-02T03:04:05.006Z"}
-	assert.NoError(srv.indexPost(ctx, &ident, &p1, "app.bsky.feed.post/3kpnillluoh2y", cid.Undef))
-
-	// https://github.com/bluesky-social/indigo/issues/302
-	p2 := appbsky.FeedPost{Text: "学校から帰って熱いお風呂に入ったら力一杯がんばる", CreatedAt: "2024-01-02T03:04:05.006Z"}
-	assert.NoError(srv.indexPost(ctx, &ident, &p2, "app.bsky.feed.post/3kpnillluo222", cid.Undef))
-	p3 := appbsky.FeedPost{Text: "熱力学", CreatedAt: "2024-01-02T03:04:05.006Z"}
-	assert.NoError(srv.indexPost(ctx, &ident, &p3, "app.bsky.feed.post/3kpnillluo333", cid.Undef))
-	p4 := appbsky.FeedPost{Text: "東京都", CreatedAt: "2024-01-02T03:04:05.006Z"}
-	assert.NoError(srv.indexPost(ctx, &ident, &p4, "app.bsky.feed.post/3kpnillluo444", cid.Undef))
-	p5 := appbsky.FeedPost{Text: "京都", CreatedAt: "2024-01-02T03:04:05.006Z"}
-	assert.NoError(srv.indexPost(ctx, &ident, &p5, "app.bsky.feed.post/3kpnillluo555", cid.Undef))
-	p6 := appbsky.FeedPost{Text: "パリ", CreatedAt: "2024-01-02T03:04:05.006Z"}
-	assert.NoError(srv.indexPost(ctx, &ident, &p6, "app.bsky.feed.post/3kpnillluo666", cid.Undef))
-	p7 := appbsky.FeedPost{Text: "ハリー・ポッター", CreatedAt: "2024-01-02T03:04:05.006Z"}
-	assert.NoError(srv.indexPost(ctx, &ident, &p7, "app.bsky.feed.post/3kpnillluo777", cid.Undef))
-	p8 := appbsky.FeedPost{Text: "ハリ", CreatedAt: "2024-01-02T03:04:05.006Z"}
-	assert.NoError(srv.indexPost(ctx, &ident, &p8, "app.bsky.feed.post/3kpnillluo223", cid.Undef))
-	p9 := appbsky.FeedPost{Text: "multilingual 多言語", CreatedAt: "2024-01-02T03:04:05.006Z"}
-	assert.NoError(srv.indexPost(ctx, &ident, &p9, "app.bsky.feed.post/3kpnillluo224", cid.Undef))
-
-	_, err = srv.escli.Indices.Refresh()
-	assert.NoError(err)
-
-	// expect all to be indexed
-	res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "*", 0, 20)
-	if err != nil {
-		t.Fatal(err)
-	}
-	assert.Equal(9, len(res.Hits.Hits))
-
-	// check that english matches (single post)
-	res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "english", 0, 20)
-	if err != nil {
-		t.Fatal(err)
-	}
-	assert.Equal(1, len(res.Hits.Hits))
-
-	// "thermodynamics"; should return only one match
-	res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "熱力学", 0, 20)
-	if err != nil {
-		t.Fatal(err)
-	}
-	assert.Equal(1, len(res.Hits.Hits))
-
-	// "Kyoto"; should return only one match
-	res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "京都", 0, 20)
-	if err != nil {
-		t.Fatal(err)
-	}
-	assert.Equal(1, len(res.Hits.Hits))
-
-	// "Paris"; should return only one match
-	res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "パリ", 0, 20)
-	if err != nil {
-		t.Fatal(err)
-	}
-	assert.Equal(1, len(res.Hits.Hits))
-
-	// should return only one match
-	res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "ハリー", 0, 20)
-	if err != nil {
-		t.Fatal(err)
-	}
-	assert.Equal(1, len(res.Hits.Hits))
-
-	// part of a word; should match none
-	res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "ハ", 0, 20)
-	if err != nil {
-		t.Fatal(err)
-	}
-	assert.Equal(0, len(res.Hits.Hits))
-
-	// should match both ways, and together
-	res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "multilingual", 0, 20)
-	if err != nil {
-		t.Fatal(err)
-	}
-	assert.Equal(1, len(res.Hits.Hits))
-
-	res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "多言語", 0, 20)
-	if err != nil {
-		t.Fatal(err)
-	}
-	assert.Equal(1, len(res.Hits.Hits))
-	res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "multilingual 多言語", 0, 20)
-	if err != nil {
-		t.Fatal(err)
-	}
-	assert.Equal(1, len(res.Hits.Hits))
-	res, err = DoSearchPosts(ctx, &dir, escli, testPostIndex, "\"multilingual 多言語\"", 0, 20)
-	if err != nil {
-		t.Fatal(err)
-	}
-	assert.Equal(1, len(res.Hits.Hits))
-}
-
 func TestParsedQuery(t *testing.T) {
 	assert := assert.New(t)
 	ctx := context.Background()

diff --git a/search/testdata/transform-post-fixtures.json b/search/testdata/transform-post-fixtures.json
@@ -266,14 +266,10 @@
 			"record_cid": "bafyreibjifzpqj6o6wcq3hejh7y4z4z2vmiklkvykc57tw3pcbx3kxifpm",
 			"created_at": "2023-08-07T05:46:14.423045Z",
 			"text": "学校から帰って熱いお風呂に入ったら力一杯がんばる",
-			"text_ja": "学校から帰って熱いお風呂に入ったら力一杯がんばる",
 			"embed_img_alt_text": [
 				"brief alt text description of the first image ハリー・ポッター",
 				"brief alt text description of the second image"
 			],
-			"embed_img_alt_text_ja": [
-				"brief alt text description of the first image ハリー・ポッター"
-			],
 			"embed_img_count": 2
 		}
 	},