From 1a74c02173b35df9e893c307d556d071efc67988 Mon Sep 17 00:00:00 2001 From: Codex Date: Thu, 21 May 2026 12:28:57 +0000 Subject: [PATCH] Allow assistant photo directives --- .codex/skills/telegram-photo/SKILL.md | 22 +++ .../skills/telegram-photo/agents/openai.yaml | 4 + .gitignore | 3 + internal/telegram/api.go | 46 +++++ internal/telegram/bot.go | 177 +++++++++++++++++- internal/telegram/render_test.go | 25 +++ 6 files changed, 272 insertions(+), 5 deletions(-) create mode 100644 .codex/skills/telegram-photo/SKILL.md create mode 100644 .codex/skills/telegram-photo/agents/openai.yaml diff --git a/.codex/skills/telegram-photo/SKILL.md b/.codex/skills/telegram-photo/SKILL.md new file mode 100644 index 0000000..8eda7c5 --- /dev/null +++ b/.codex/skills/telegram-photo/SKILL.md @@ -0,0 +1,22 @@ +--- +name: telegram-photo +description: Use when Codex should send, show, or share a local picture into the Telegram chat through the bot without calling Telegram tools. +metadata: + short-description: Send Telegram photos from assistant output +--- + +# Telegram Photo + +When asked to send/show/share a picture in Telegram, emit a photo directive in normal assistant output. The bot strips the directive and sends the image as a Telegram photo. + +Use exactly one directive line per image, outside code fences: + +`<!-- telegram-photo {"path":"<absolute-image-path>","caption":"<optional caption>"} -->` + +Rules: +- Replace `<absolute-image-path>` with an absolute path that exists in the current workspace or another location visible to the bot process. +- Do not hardcode machine-specific directories, user names, repository paths, or sample filenames in this skill. +- Supported extensions are `.jpg`, `.jpeg`, `.png`, `.webp`, and `.gif`. +- `caption` is optional and should be short; omit the `caption` field when no caption is needed. +- Do not use external Telegram tool calls for this. +- If no usable image path is known, ask for the path or explain what local file is needed. 
diff --git a/.codex/skills/telegram-photo/agents/openai.yaml b/.codex/skills/telegram-photo/agents/openai.yaml new file mode 100644 index 0000000..b088fdc --- /dev/null +++ b/.codex/skills/telegram-photo/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Telegram Photo" + short_description: "Send Telegram photos from assistant output." + default_prompt: "Send a local image to the Telegram chat using the bot photo directive." diff --git a/.gitignore b/.gitignore index 90f9ed7..0a5d688 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,9 @@ *.test coverage.out +# Local scratch assets +/codex-telegram-bot-profile.jpg + # Editor/OS noise .DS_Store .idea/ diff --git a/internal/telegram/api.go b/internal/telegram/api.go index f73d0de..5351860 100644 --- a/internal/telegram/api.go +++ b/internal/telegram/api.go @@ -144,6 +144,52 @@ func (c *Client) DownloadFile(ctx context.Context, filePath string) ([]byte, err return io.ReadAll(resp.Body) } +func (c *Client) SendPhotoBytes(ctx context.Context, chatID int64, filename string, data []byte, caption string) (Message, error) { + var body bytes.Buffer + writer := multipart.NewWriter(&body) + if err := writer.WriteField("chat_id", fmt.Sprint(chatID)); err != nil { + return Message{}, err + } + if caption != "" { + if err := writer.WriteField("caption", caption); err != nil { + return Message{}, err + } + } + part, err := writer.CreateFormFile("photo", filepath.Base(filename)) + if err != nil { + return Message{}, err + } + if _, err := part.Write(data); err != nil { + return Message{}, err + } + if err := writer.Close(); err != nil { + return Message{}, err + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/sendPhoto", &body) + if err != nil { + return Message{}, err + } + req.Header.Set("Content-Type", writer.FormDataContentType()) + resp, err := c.httpClient.Do(req) + if err != nil { + return Message{}, err + } + defer resp.Body.Close() + if resp.StatusCode < 200 || resp.StatusCode 
>= 300 { + payload, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + return Message{}, fmt.Errorf("sendPhoto: telegram returned %s: %s", resp.Status, string(payload)) + } + var decoded apiResponse[Message] + if err := json.NewDecoder(resp.Body).Decode(&decoded); err != nil { + return Message{}, err + } + if !decoded.OK { + return Message{}, fmt.Errorf("sendPhoto: telegram error %d: %s", decoded.ErrorCode, decoded.Description) + } + return decoded.Result, nil +} + func (c *Client) SendDocumentBytes(ctx context.Context, chatID int64, filename string, data []byte, caption string) (Message, error) { var body bytes.Buffer writer := multipart.NewWriter(&body) diff --git a/internal/telegram/bot.go b/internal/telegram/bot.go index 8744f8b..c16b603 100644 --- a/internal/telegram/bot.go +++ b/internal/telegram/bot.go @@ -20,9 +20,12 @@ import ( ) const ( - telegramDownloadLimit = 20 * 1024 * 1024 - resumeThreadPageSize = 8 - commandSummaryLimit = 120 + telegramDownloadLimit = 20 * 1024 * 1024 + resumeThreadPageSize = 8 + commandSummaryLimit = 120 + telegramPhotoDirectiveStart = "" + telegramPhotoCaptionLimit = 1024 ) type Bot struct { @@ -41,11 +44,22 @@ type Bot struct { diffs map[string]string } +type assistantMessageSegment struct { + Text string + Photo *assistantPhotoDirective +} + +type assistantPhotoDirective struct { + Path string `json:"path"` + Caption string `json:"caption,omitempty"` +} + type outputState struct { chatID int64 assistant strings.Builder sentAny bool tools map[string]toolMessageState + sentImages map[string]bool workingIndicatorOff context.CancelFunc } @@ -689,6 +703,15 @@ func (b *Bot) handleSandboxCommand(ctx context.Context, userID, chatID int64, se return err } +func isPicturePath(path string) bool { + switch strings.ToLower(filepath.Ext(path)) { + case ".jpg", ".jpeg", ".png", ".webp", ".gif": + return true + default: + return false + } +} + func (b *Bot) sendDiff(ctx context.Context, chatID int64, session store.Session) error { if 
session.ActiveThreadID == 0 { _, err := b.tg.SendMessage(ctx, chatID, "No active thread.", SendMessageOptions{}) @@ -1333,7 +1356,10 @@ func (b *Bot) handleCodexNotification(ctx context.Context, event codexapp.Event) return b.flushAssistantMessage(ctx, params.ThreadID) } if params.ThreadID != "" { - return b.upsertToolMessage(ctx, params.ThreadID, item.ID, renderCodexItemCompleted(item)) + if err := b.upsertToolMessage(ctx, params.ThreadID, item.ID, renderCodexItemCompleted(item)); err != nil { + return err + } + return b.sendImageOutput(ctx, params.ThreadID, item) } case "turn/diff/updated": var params struct { @@ -1456,6 +1482,7 @@ func (b *Bot) newOutputState(chatID int64) *outputState { return &outputState{ chatID: chatID, tools: make(map[string]toolMessageState), + sentImages: make(map[string]bool), workingIndicatorOff: b.startWorkingIndicator(chatID), } } @@ -1550,6 +1577,52 @@ func (b *Bot) failActiveOutputs(ctx context.Context, message string) { } } +func (b *Bot) sendImageOutput(ctx context.Context, threadID string, item codexThreadItemView) error { + if item.Type != "imageGeneration" || strings.TrimSpace(item.SavedPath) == "" { + return nil + } + path := strings.TrimSpace(item.SavedPath) + if !b.markImageOutputPending(threadID, path) { + return nil + } + data, err := os.ReadFile(path) + if err != nil { + b.logger.Printf("read generated image %s: %v", path, err) + return nil + } + chatID, err := b.outputChatID(ctx, threadID) + if err != nil { + return nil + } + caption := "Generated image" + if item.Status != "" { + caption += ": " + item.Status + } + if _, err := b.tg.SendPhotoBytes(ctx, chatID, path, data, caption); err != nil { + b.logger.Printf("send generated image %s: %v", path, err) + return nil + } + b.markOutputSent(threadID) + return nil +} + +func (b *Bot) markImageOutputPending(threadID, path string) bool { + b.mu.Lock() + defer b.mu.Unlock() + state := b.outputs[threadID] + if state == nil { + return false + } + if state.sentImages == nil { + 
state.sentImages = make(map[string]bool) + } + if state.sentImages[path] { + return false + } + state.sentImages[path] = true + return true +} + func (b *Bot) sendOutputBlock(ctx context.Context, threadID, block string) error { block = strings.TrimSpace(block) if block == "" { @@ -1764,6 +1837,100 @@ func ignoreTelegramMessageNotModified(err error) error { } return err } + +func splitAssistantMessageSegments(text string) []assistantMessageSegment { + var segments []assistantMessageSegment + var visible strings.Builder + flushVisible := func() { + if visible.Len() == 0 { + return + } + segments = append(segments, assistantMessageSegment{Text: visible.String()}) + visible.Reset() + } + + for _, line := range strings.SplitAfter(text, "\n") { + body := strings.TrimSuffix(line, "\n") + body = strings.TrimSuffix(body, "\r") + if directive, ok := parseAssistantPhotoDirectiveLine(body); ok { + flushVisible() + segments = append(segments, assistantMessageSegment{Photo: &directive}) + continue + } + visible.WriteString(line) + } + flushVisible() + return segments +} + +func parseAssistantPhotoDirectiveLine(line string) (assistantPhotoDirective, bool) { + trimmed := strings.TrimSpace(line) + if !strings.HasPrefix(trimmed, telegramPhotoDirectiveStart) || !strings.HasSuffix(trimmed, telegramPhotoDirectiveEnd) { + return assistantPhotoDirective{}, false + } + raw := strings.TrimSuffix(strings.TrimPrefix(trimmed, telegramPhotoDirectiveStart), telegramPhotoDirectiveEnd) + raw = strings.TrimSpace(raw) + var directive assistantPhotoDirective + if err := json.Unmarshal([]byte(raw), &directive); err != nil { + return assistantPhotoDirective{}, false + } + directive.Path = strings.TrimSpace(directive.Path) + directive.Caption = strings.TrimSpace(directive.Caption) + return directive, true +} + +func (b *Bot) sendAssistantText(ctx context.Context, chatID int64, text string) error { + for _, segment := range splitAssistantMessageSegments(text) { + if segment.Text != "" && 
strings.TrimSpace(segment.Text) != "" { + if err := b.sendLong(ctx, chatID, segment.Text); err != nil { + return err + } + } + if segment.Photo != nil { + if err := b.sendAssistantPhoto(ctx, chatID, *segment.Photo); err != nil { + b.logger.Printf("send assistant photo: %v", err) + if sendErr := b.sendLong(ctx, chatID, "Could not send photo: "+err.Error()); sendErr != nil { + return sendErr + } + } + } + } + return nil +} + +func (b *Bot) sendAssistantPhoto(ctx context.Context, chatID int64, directive assistantPhotoDirective) error { + path := strings.TrimSpace(directive.Path) + if path == "" { + return errors.New("photo directive is missing a path") + } + if !filepath.IsAbs(path) { + return fmt.Errorf("photo path must be absolute: %s", path) + } + if !isPicturePath(path) { + return fmt.Errorf("unsupported photo type: %s", filepath.Base(path)) + } + data, err := os.ReadFile(path) + if err != nil { + return fmt.Errorf("read %s: %v", filepath.Base(path), err) + } + caption := truncateTelegramPhotoCaption(directive.Caption) + if _, err := b.tg.SendPhotoBytes(ctx, chatID, path, data, caption); err != nil { + return fmt.Errorf("send %s: %v", filepath.Base(path), err) + } + return nil +} + +func truncateTelegramPhotoCaption(caption string) string { + runes := []rune(caption) + if len(runes) <= telegramPhotoCaptionLimit { + return caption + } + if telegramPhotoCaptionLimit <= 3 { + return string(runes[:telegramPhotoCaptionLimit]) + } + return string(runes[:telegramPhotoCaptionLimit-3]) + "..." 
+} + func (b *Bot) appendAssistantDelta(ctx context.Context, threadID, delta string) error { if delta == "" { return nil @@ -1792,7 +1959,7 @@ func (b *Bot) flushAssistantMessage(ctx context.Context, threadID string) error state.assistant.Reset() b.mu.Unlock() - if err := b.sendLong(ctx, chatID, text); err != nil { + if err := b.sendAssistantText(ctx, chatID, text); err != nil { return err } b.markOutputSent(threadID) diff --git a/internal/telegram/render_test.go b/internal/telegram/render_test.go index 31a56a6..b5fd79d 100644 --- a/internal/telegram/render_test.go +++ b/internal/telegram/render_test.go @@ -77,6 +77,31 @@ func TestParseCommand(t *testing.T) { } } +func TestSplitAssistantMessageSegmentsWithPhotoDirective(t *testing.T) { + text := "before\n\nafter" + segments := splitAssistantMessageSegments(text) + if len(segments) != 3 { + t.Fatalf("segments = %d, want 3: %#v", len(segments), segments) + } + if segments[0].Text != "before\n" || segments[0].Photo != nil { + t.Fatalf("unexpected first segment: %#v", segments[0]) + } + if segments[1].Photo == nil || segments[1].Photo.Path != "/tmp/photo.jpg" || segments[1].Photo.Caption != "hello" { + t.Fatalf("unexpected photo segment: %#v", segments[1]) + } + if segments[2].Text != "after" || segments[2].Photo != nil { + t.Fatalf("unexpected final segment: %#v", segments[2]) + } +} + +func TestInvalidPhotoDirectiveStaysVisible(t *testing.T) { + text := "" + segments := splitAssistantMessageSegments(text) + if len(segments) != 1 || segments[0].Text != text { + t.Fatalf("invalid directive should stay text: %#v", segments) + } +} + func TestRenderCodexCommandExecutionItem(t *testing.T) { output := "line 1\nline 2" exitCode := 0