From 4bf1c0450c93efbf02bfbc21075cad25096e5dbb Mon Sep 17 00:00:00 2001
From: Ben Wiederhake <BenWiederhake.GitHub@gmx.de>
Date: Fri, 24 May 2024 00:14:45 +0200
Subject: [PATCH] Split messages if necessary (discord) (#2124)

* Implement and test byte-splitting helper function

* Implement discord botuser message splitting

* Implement discord webhooks message splitting
---
 bridge/config/config.go      |   1 +
 bridge/discord/discord.go    |  59 ++++++++++-----
 bridge/discord/webhook.go    | 143 ++++++++++++++++++++---------------
 bridge/helper/helper.go      |  27 +++++++
 bridge/helper/helper_test.go | 102 +++++++++++++++++++++++++
 matterbridge.toml.sample     |  11 ++-
 6 files changed, 262 insertions(+), 81 deletions(-)

diff --git a/bridge/config/config.go b/bridge/config/config.go
index 18c6092082..75792ed0c3 100644
--- a/bridge/config/config.go
+++ b/bridge/config/config.go
@@ -121,6 +121,7 @@ type Protocol struct {
 	MessageLength          int        // IRC, max length of a message allowed
 	MessageQueue           int        // IRC, size of message queue for flood control
 	MessageSplit           bool       // IRC, split long messages with newlines on MessageLength instead of clipping
+	MessageSplitMaxCount   int        // discord, split long messages into at most this many messages instead of clipping (MessageLength=1950 cannot be configured)
 	Muc                    string     // xmpp
 	MxID                   string     // matrix
 	Name                   string     // all protocols
diff --git a/bridge/discord/discord.go b/bridge/discord/discord.go
index 9bac021ad5..2707ad2086 100644
--- a/bridge/discord/discord.go
+++ b/bridge/discord/discord.go
@@ -316,6 +316,7 @@ func (b *Bdiscord) handleEventBotUser(msg *config.Message, channelID string) (st
 	// Upload a file if it exists
 	if msg.Extra != nil {
 		for _, rmsg := range helper.HandleExtra(msg, b.General) {
+			// TODO: Use ClipOrSplitMessage
 			rmsg.Text = helper.ClipMessage(rmsg.Text, MessageLength, b.GetString("MessageClipped"))
 			if _, err := b.c.ChannelMessageSend(channelID, rmsg.Username+rmsg.Text); err != nil {
 				b.Log.Errorf("Could not send message %#v: %s", rmsg, err)
@@ -327,35 +328,53 @@ func (b *Bdiscord) handleEventBotUser(msg *config.Message, channelID string) (st
 		}
 	}
 
-	msg.Text = helper.ClipMessage(msg.Text, MessageLength, b.GetString("MessageClipped"))
-	msg.Text = b.replaceUserMentions(msg.Text)
-
 	// Edit message
 	if msg.ID != "" {
-		_, err := b.c.ChannelMessageEdit(channelID, msg.ID, msg.Username+msg.Text)
-		return msg.ID, err
+		// Exploit that a discord message ID is actually just a large number, and we encode a list of IDs by separating them with ";".
+		var msgIds = strings.Split(msg.ID, ";")
+		msgParts := helper.ClipOrSplitMessage(b.replaceUserMentions(msg.Text), MessageLength, b.GetString("MessageClipped"), len(msgIds))
+		for len(msgParts) < len(msgIds) {
+			msgParts = append(msgParts, "((obsoleted by edit))")
+		}
+		for i := range msgParts {
+			// In case of split-messages where some parts remain the same (e.g. only a typo-fix in a huge message), this causes some noop-updates.
+			// TODO: Optimize away noop-updates of un-edited messages
+			// TODO: Use RemoteNickFormat instead of this broken concatenation
+			_, err := b.c.ChannelMessageEdit(channelID, msgIds[i], msg.Username+msgParts[i])
+			if err != nil {
+				return "", err
+			}
+		}
+		return msg.ID, nil
 	}
 
-	m := discordgo.MessageSend{
-		Content:         msg.Username + msg.Text,
-		AllowedMentions: b.getAllowedMentions(),
-	}
+	msgParts := helper.ClipOrSplitMessage(b.replaceUserMentions(msg.Text), MessageLength, b.GetString("MessageClipped"), b.GetInt("MessageSplitMaxCount"))
+	var msgIds = []string{}
 
-	if msg.ParentValid() {
-		m.Reference = &discordgo.MessageReference{
-			MessageID: msg.ParentID,
-			ChannelID: channelID,
-			GuildID:   b.guildID,
+	for _, msgPart := range msgParts {
+		m := discordgo.MessageSend{
+			Content:         msg.Username + msgPart,
+			AllowedMentions: b.getAllowedMentions(),
 		}
-	}
 
-	// Post normal message
-	res, err := b.c.ChannelMessageSendComplex(channelID, &m)
-	if err != nil {
-		return "", err
+		if msg.ParentValid() {
+			m.Reference = &discordgo.MessageReference{
+				MessageID: msg.ParentID,
+				ChannelID: channelID,
+				GuildID:   b.guildID,
+			}
+		}
+
+		// Post normal message
+		res, err := b.c.ChannelMessageSendComplex(channelID, &m)
+		if err != nil {
+			return "", err
+		}
+		msgIds = append(msgIds, res.ID)
 	}
 
-	return res.ID, nil
+	// Exploit that a discord message ID is actually just a large number, so we encode a list of IDs by separating them with ";".
+	return strings.Join(msgIds, ";"), nil
 }
 
 // handleUploadFile handles native upload of files
diff --git a/bridge/discord/webhook.go b/bridge/discord/webhook.go
index b518ea6262..4e647b3856 100644
--- a/bridge/discord/webhook.go
+++ b/bridge/discord/webhook.go
@@ -2,6 +2,7 @@ package bdiscord
 
 import (
 	"bytes"
+	"strings"
 
 	"github.com/42wim/matterbridge/bridge/config"
 	"github.com/42wim/matterbridge/bridge/helper"
@@ -42,13 +43,65 @@ func (b *Bdiscord) maybeGetLocalAvatar(msg *config.Message) string {
 	return ""
 }
 
+// webhookSendTextOnly sends msg.Text through the webhook transmitter, split (or clipped) by helper.ClipOrSplitMessage.
+// It returns the IDs of all sent parts joined with ";" (a discord message ID is just a large number), or "" and the first send error.
+func (b *Bdiscord) webhookSendTextOnly(msg *config.Message, channelID string) (string, error) {
+	msgParts := helper.ClipOrSplitMessage(msg.Text, MessageLength, b.GetString("MessageClipped"), b.GetInt("MessageSplitMaxCount"))
+	msgIDs := make([]string, 0, len(msgParts))
+	for _, msgPart := range msgParts {
+		res, err := b.transmitter.Send(
+			channelID,
+			&discordgo.WebhookParams{
+				Content:         msgPart,
+				Username:        msg.Username,
+				AvatarURL:       msg.Avatar,
+				AllowedMentions: b.getAllowedMentions(),
+			},
+		)
+		if err != nil {
+			return "", err
+		}
+		msgIDs = append(msgIDs, res.ID)
+	}
+	return strings.Join(msgIDs, ";"), nil
+}
+
+// webhookSendFilesOnly uploads every entry of msg.Extra["file"] through the webhook transmitter,
+// one webhook message per file, using the file's comment as the message content.
+// It stops at the first failed upload, logs it and returns that error; nil means every file was sent.
+func (b *Bdiscord) webhookSendFilesOnly(msg *config.Message, channelID string) error {
+	for _, extra := range msg.Extra["file"] {
+		// NOTE(review): unchecked assertion; presumably every "file" entry is a config.FileInfo - confirm against producers.
+		info := extra.(config.FileInfo)
+		upload := discordgo.File{
+			Name:        info.Name,
+			ContentType: "",
+			Reader:      bytes.NewReader(*info.Data),
+		}
+		params := discordgo.WebhookParams{
+			Username:        msg.Username,
+			AvatarURL:       msg.Avatar,
+			Files:           []*discordgo.File{&upload},
+			Content:         info.Comment,
+			AllowedMentions: b.getAllowedMentions(),
+		}
+
+		// Cannot use the resulting ID for any edits anyway, so throw it away.
+		// This has to be re-enabled when we implement message deletion.
+		if _, err := b.transmitter.Send(channelID, &params); err != nil {
+			b.Log.Errorf("Could not send file %#v for message %#v: %s", upload, msg, err)
+			return err
+		}
+	}
+	return nil
+}
+
 // webhookSend send one or more message via webhook, taking care of file
 // uploads (from slack, telegram or mattermost).
 // Returns messageID and error.
-func (b *Bdiscord) webhookSend(msg *config.Message, channelID string) (*discordgo.Message, error) {
+func (b *Bdiscord) webhookSend(msg *config.Message, channelID string) (string, error) {
 	var (
-		res  *discordgo.Message
-		res2 *discordgo.Message
+		res  string
 		err  error
 	)
 
@@ -61,48 +114,11 @@ func (b *Bdiscord) webhookSend(msg *config.Message, channelID string) (*discordg
 
 	// We can't send empty messages.
 	if msg.Text != "" {
-		res, err = b.transmitter.Send(
-			channelID,
-			&discordgo.WebhookParams{
-				Content:         msg.Text,
-				Username:        msg.Username,
-				AvatarURL:       msg.Avatar,
-				AllowedMentions: b.getAllowedMentions(),
-			},
-		)
-		if err != nil {
-			b.Log.Errorf("Could not send text (%s) for message %#v: %s", msg.Text, msg, err)
-		}
+		res, err = b.webhookSendTextOnly(msg, channelID)
 	}
 
-	if msg.Extra != nil {
-		for _, f := range msg.Extra["file"] {
-			fi := f.(config.FileInfo)
-			file := discordgo.File{
-				Name:        fi.Name,
-				ContentType: "",
-				Reader:      bytes.NewReader(*fi.Data),
-			}
-			content := fi.Comment
-
-			res2, err = b.transmitter.Send(
-				channelID,
-				&discordgo.WebhookParams{
-					Username:        msg.Username,
-					AvatarURL:       msg.Avatar,
-					Files:           []*discordgo.File{&file},
-					Content:         content,
-					AllowedMentions: b.getAllowedMentions(),
-				},
-			)
-			if err != nil {
-				b.Log.Errorf("Could not send file %#v for message %#v: %s", file, msg, err)
-			}
-		}
-	}
-
-	if msg.Text == "" {
-		res = res2
+	if err == nil && msg.Extra != nil {
+		err = b.webhookSendFilesOnly(msg, channelID)
 	}
 
 	return res, err
@@ -120,35 +136,44 @@ func (b *Bdiscord) handleEventWebhook(msg *config.Message, channelID string) (st
 		return "", nil
 	}
 
-	msg.Text = helper.ClipMessage(msg.Text, MessageLength, b.GetString("MessageClipped"))
-	msg.Text = b.replaceUserMentions(msg.Text)
 	// discord username must be [0..32] max
 	if len(msg.Username) > 32 {
 		msg.Username = msg.Username[0:32]
 	}
 
 	if msg.ID != "" {
+		// Exploit that a discord message ID is actually just a large number, and we encode a list of IDs by separating them with ";".
+		var msgIds = strings.Split(msg.ID, ";")
+		msgParts := helper.ClipOrSplitMessage(b.replaceUserMentions(msg.Text), MessageLength, b.GetString("MessageClipped"), len(msgIds))
+		for len(msgParts) < len(msgIds) {
+			msgParts = append(msgParts, "((obsoleted by edit))")
+		}
 		b.Log.Debugf("Editing webhook message")
-		err := b.transmitter.Edit(channelID, msg.ID, &discordgo.WebhookParams{
-			Content:         msg.Text,
-			Username:        msg.Username,
-			AllowedMentions: b.getAllowedMentions(),
-		})
-		if err == nil {
+		var edit_err error = nil
+		for i := range msgParts {
+			// In case of split-messages where some parts remain the same (e.g. only a typo-fix in a huge message), this causes some noop-updates.
+			// TODO: Optimize away noop-updates of un-edited messages
+			edit_err = b.transmitter.Edit(channelID, msgIds[i], &discordgo.WebhookParams{
+				Content:         msgParts[i],
+				Username:        msg.Username,
+				AllowedMentions: b.getAllowedMentions(),
+			})
+			if edit_err != nil {
+				break
+			}
+		}
+		if edit_err == nil {
 			return msg.ID, nil
 		}
-		b.Log.Errorf("Could not edit webhook message: %s", err)
+		b.Log.Errorf("Could not edit webhook message(s): %s; sending as new message(s) instead", edit_err)
 	}
 
 	b.Log.Debugf("Processing webhook sending for message %#v", msg)
-	discordMsg, err := b.webhookSend(msg, channelID)
+	msg.Text = b.replaceUserMentions(msg.Text)
+	msgId, err := b.webhookSend(msg, channelID)
 	if err != nil {
-		b.Log.Errorf("Could not broadcast via webhook for message %#v: %s", msg, err)
+		b.Log.Errorf("Could not broadcast via webhook for message %#v: %s", msgId, err)
 		return "", err
 	}
-	if discordMsg == nil {
-		return "", nil
-	}
-
-	return discordMsg.ID, nil
+	return msgId, nil
 }
diff --git a/bridge/helper/helper.go b/bridge/helper/helper.go
index d6488af66a..d968f4d8c4 100644
--- a/bridge/helper/helper.go
+++ b/bridge/helper/helper.go
@@ -229,6 +229,33 @@ func ClipMessage(text string, length int, clippingMessage string) string {
 	return text
 }
 
+// ClipOrSplitMessage splits text into at most splitMax parts of at most length bytes each,
+// preferring to cut on UTF-8 rune boundaries. The final part is passed through ClipMessage, so text
+// that does not fit into splitMax parts is clipped. A splitMax below 1 is treated like 1.
+func ClipOrSplitMessage(text string, length int, clippingMessage string, splitMax int) []string {
+	var msgParts []string
+	remainingText := text
+	// Invariant of this splitting loop: No text is lost (msgParts+remainingText is the original text),
+	// and all parts are guaranteed to satisfy the length requirement.
+	for len(msgParts) < splitMax-1 && len(remainingText) > length {
+		var chunk string
+		// The longest UTF-8 encoding of a valid rune is utf8.UTFMax = 4 bytes, so backing off by
+		// at most 3 bytes is enough to reach a rune boundary in well-formed text.
+		for wasted := 0; wasted < utf8.UTFMax && wasted < length; wasted++ {
+			chunk = remainingText[:length-wasted]
+			// (RuneError, 1) means a broken or truncated encoding. A genuine U+FFFD decodes with size 3 and must not cost us bytes.
+			if r, size := utf8.DecodeLastRuneInString(chunk); r != utf8.RuneError || size > 1 {
+				break
+			}
+		}
+		// Note: At this point, "chunk" might still be invalid, if "text" is very broken.
+		msgParts = append(msgParts, chunk)
+		remainingText = remainingText[len(chunk):]
+	}
+	msgParts = append(msgParts, ClipMessage(remainingText, length, clippingMessage))
+	return msgParts
+}
+
 // ParseMarkdown takes in an input string as markdown and parses it to html
 func ParseMarkdown(input string) string {
 	extensions := parser.HardLineBreak | parser.NoIntraEmphasis | parser.FencedCode
diff --git a/bridge/helper/helper_test.go b/bridge/helper/helper_test.go
index f21a4bda8e..739ece9a27 100644
--- a/bridge/helper/helper_test.go
+++ b/bridge/helper/helper_test.go
@@ -134,3 +134,105 @@ func TestConvertWebPToPNG(t *testing.T) {
 		t.Fail()
 	}
 }
+
+var clippingOrSplittingTestCases = map[string]struct {
+	inputText       string   // text passed to ClipOrSplitMessage
+	clipSplitLength int      // maximum byte length of each returned part
+	clippingMessage string   // marker appended to clipped text; "" selects the built-in default
+	splitMax        int      // maximum number of parts
+	expectedOutput  []string // exact parts expected back, in order
+}{
+	"Short single-line message, split 3": {
+		inputText:       "short",
+		clipSplitLength: 20,
+		clippingMessage: "?!?!",
+		splitMax:        3,
+		expectedOutput:  []string{"short"},
+	},
+	"Short single-line message, split 1": {
+		inputText:       "short",
+		clipSplitLength: 20,
+		clippingMessage: "?!?!",
+		splitMax:        1,
+		expectedOutput:  []string{"short"},
+	},
+	"Short single-line message, split 0": {
+		// splitMax=0 must still yield one part; mainly check that we don't crash.
+		inputText:       "short",
+		clipSplitLength: 20,
+		clippingMessage: "?!?!",
+		splitMax:        0,
+		expectedOutput:  []string{"short"},
+	},
+	"Long single-line message, noclip": {
+		inputText:       "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
+		clipSplitLength: 50,
+		clippingMessage: "?!?!",
+		splitMax:        10,
+		expectedOutput:  []string{
+			"Lorem ipsum dolor sit amet, consectetur adipiscing",
+			" elit, sed do eiusmod tempor incididunt ut labore ",
+			"et dolore magna aliqua.",
+		},
+	},
+	"Long single-line message, noclip tight": {
+		inputText:       "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
+		clipSplitLength: 50,
+		clippingMessage: "?!?!",
+		splitMax:        3,
+		expectedOutput:  []string{
+			"Lorem ipsum dolor sit amet, consectetur adipiscing",
+			" elit, sed do eiusmod tempor incididunt ut labore ",
+			"et dolore magna aliqua.",
+		},
+	},
+	"Long single-line message, clip custom": {
+		inputText:       "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
+		clipSplitLength: 50,
+		clippingMessage: "?!?!",
+		splitMax:        2,
+		expectedOutput:  []string{
+			"Lorem ipsum dolor sit amet, consectetur adipiscing",
+			" elit, sed do eiusmod tempor incididunt ut lab?!?!",
+		},
+	},
+	"Long single-line message, clip built-in": {
+		inputText:       "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.",
+		clipSplitLength: 50,
+		clippingMessage: "",
+		splitMax:        2,
+		expectedOutput:  []string{
+			"Lorem ipsum dolor sit amet, consectetur adipiscing",
+			" elit, sed do eiusmod tempor inc <clipped message>",
+		},
+	},
+	"Short multi-line message": {
+		inputText:       "I\ncan't\nget\nno\nsatisfaction!",
+		clipSplitLength: 50,
+		clippingMessage: "",
+		splitMax:        2,
+		expectedOutput:  []string{"I\ncan't\nget\nno\nsatisfaction!"},
+	},
+	"Long message containing UTF-8 multi-byte runes": {
+		inputText:       "人人生而自由,在尊嚴和權利上一律平等。 他們都具有理性和良知,應該以兄弟情誼的精神對待彼此。",
+		clipSplitLength: 50,
+		clippingMessage: "",
+		splitMax:        10,
+		expectedOutput:  []string{
+			"人人生而自由,在尊嚴和權利上一律", // Note: only 48 bytes!
+			"平等。 他們都具有理性和良知,應該", // Note: only 49 bytes!
+			"以兄弟情誼的精神對待彼此。",
+		},
+	},
+}
+
+func TestClipOrSplitMessage(t *testing.T) {
+	for testname, testcase := range clippingOrSplittingTestCases {
+		actualOutput := ClipOrSplitMessage(testcase.inputText, testcase.clipSplitLength, testcase.clippingMessage, testcase.splitMax)
+		assert.Equalf(t, testcase.expectedOutput, actualOutput, "'%s' testcase should give expected lines with clipping+splitting.", testname)
+		// Enforce the byte limit on what the function really returned, not on the hand-written fixture.
+		for _, splitLine := range actualOutput {
+			assert.True(t, len(splitLine) <= testcase.clipSplitLength, "Splitted line '%s' of testcase '%s' should not exceed the maximum byte-length (%d vs. %d).", splitLine, testname, testcase.clipSplitLength, len(splitLine))
+		}
+	}
+}
diff --git a/matterbridge.toml.sample b/matterbridge.toml.sample
index 5932b269a3..a3c471f5e5 100644
--- a/matterbridge.toml.sample
+++ b/matterbridge.toml.sample
@@ -925,10 +925,17 @@ ShowTopicChange=false
 # Supported from the following bridges: slack
 SyncTopic=false
 
-#Message to show when a message is too big
-#Default "<clipped message>"
+# Message to show when a message is too big
+# Default "<clipped message>"
 MessageClipped="<clipped message>"
 
+# Before clipping, try to split messages into at most this many parts. 0 is treated like 1.
+# Be careful with large numbers, as this might cause flooding.
+# Example: A maximum telegram message of 4096 bytes is received. This requires 3 Discord
+# messages (each capped at a hardcoded 1950 bytes).
+# Default 1
+MessageSplitMaxCount=3
+
 ###################################################################
 #telegram section
 ###################################################################