Allow assistant photo directives

This commit is contained in:
Codex
2026-05-21 12:28:57 +00:00
parent e9425c6d9b
commit 1a74c02173
6 changed files with 272 additions and 5 deletions

View File

@@ -0,0 +1,22 @@
---
name: telegram-photo
description: Use when Codex should send, show, or share a local picture into the Telegram chat through the bot without calling Telegram tools.
metadata:
short-description: Send Telegram photos from assistant output
---
# Telegram Photo
When asked to send/show/share a picture in Telegram, emit a photo directive in normal assistant output. The bot strips the directive and sends the image as a Telegram photo.
Use exactly one directive per image. Put each directive on a line of its own, as single-line JSON, outside code fences:
`<!-- telegram-photo {"path":"<absolute-local-image-path>","caption":"<optional caption>"} -->`
Rules:
- Replace `<absolute-local-image-path>` with an absolute path that exists in the current workspace or another location visible to the bot process.
- Do not hardcode machine-specific directories, user names, repository paths, or sample filenames in this skill.
- Supported extensions are `.jpg`, `.jpeg`, `.png`, `.webp`, and `.gif`.
- `caption` is optional and should be short (the bot truncates captions longer than 1024 characters); omit the `caption` field when no caption is needed.
- Do not use external Telegram tool calls for this.
- If no usable image path is known, ask for the path or explain what local file is needed.

View File

@@ -0,0 +1,4 @@
interface:
display_name: "Telegram Photo"
short_description: "Send Telegram photos from assistant output."
default_prompt: "Send a local image to the Telegram chat using the bot photo directive."

3
.gitignore vendored
View File

@@ -24,6 +24,9 @@
*.test
coverage.out
# Local scratch assets
/codex-telegram-bot-profile.jpg
# Editor/OS noise
.DS_Store
.idea/

View File

@@ -144,6 +144,52 @@ func (c *Client) DownloadFile(ctx context.Context, filePath string) ([]byte, err
return io.ReadAll(resp.Body)
}
// SendPhotoBytes uploads an in-memory image to the chat identified by chatID
// by POSTing a multipart/form-data request to the sendPhoto endpoint.
//
// Only the base name of filename is used as the uploaded file name. The
// caption form field is omitted when caption is empty. A non-2xx HTTP status,
// an undecodable response body, or a response whose OK flag is false is
// reported as an error; otherwise the decoded result message is returned.
func (c *Client) SendPhotoBytes(ctx context.Context, chatID int64, filename string, data []byte, caption string) (Message, error) {
	var none Message

	form := new(bytes.Buffer)
	mw := multipart.NewWriter(form)

	// Scalar fields go first, in a fixed order, followed by the file part.
	fields := [][2]string{{"chat_id", fmt.Sprint(chatID)}}
	if caption != "" {
		fields = append(fields, [2]string{"caption", caption})
	}
	for _, kv := range fields {
		if err := mw.WriteField(kv[0], kv[1]); err != nil {
			return none, err
		}
	}

	upload, err := mw.CreateFormFile("photo", filepath.Base(filename))
	if err != nil {
		return none, err
	}
	if _, err = upload.Write(data); err != nil {
		return none, err
	}
	// Closing the writer emits the terminating multipart boundary.
	if err = mw.Close(); err != nil {
		return none, err
	}

	endpoint := c.baseURL + "/sendPhoto"
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, form)
	if err != nil {
		return none, err
	}
	req.Header.Set("Content-Type", mw.FormDataContentType())

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return none, err
	}
	defer resp.Body.Close()

	if code := resp.StatusCode; code < http.StatusOK || code >= http.StatusMultipleChoices {
		// Only a bounded prefix of the body is quoted in the error; a read
		// failure here just leaves that excerpt shorter.
		excerpt, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return none, fmt.Errorf("sendPhoto: telegram returned %s: %s", resp.Status, string(excerpt))
	}

	var reply apiResponse[Message]
	if err = json.NewDecoder(resp.Body).Decode(&reply); err != nil {
		return none, err
	}
	if reply.OK {
		return reply.Result, nil
	}
	return none, fmt.Errorf("sendPhoto: telegram error %d: %s", reply.ErrorCode, reply.Description)
}
func (c *Client) SendDocumentBytes(ctx context.Context, chatID int64, filename string, data []byte, caption string) (Message, error) {
var body bytes.Buffer
writer := multipart.NewWriter(&body)

View File

@@ -20,9 +20,12 @@ import (
)
const (
telegramDownloadLimit = 20 * 1024 * 1024
resumeThreadPageSize = 8
commandSummaryLimit = 120
telegramDownloadLimit = 20 * 1024 * 1024
resumeThreadPageSize = 8
commandSummaryLimit = 120
// telegramPhotoDirectiveStart is the literal prefix (including the trailing
// space) that a trimmed assistant output line must begin with to be parsed
// as a photo directive.
telegramPhotoDirectiveStart = "<!-- telegram-photo "
// telegramPhotoDirectiveEnd is the literal suffix (including the leading
// space) that closes a photo directive line.
telegramPhotoDirectiveEnd = " -->"
// telegramPhotoCaptionLimit is the maximum caption length, counted in runes,
// that truncateTelegramPhotoCaption lets through.
telegramPhotoCaptionLimit = 1024
)
type Bot struct {
@@ -41,11 +44,22 @@ type Bot struct {
diffs map[string]string
}
// assistantMessageSegment is one piece of an assistant message as produced by
// splitAssistantMessageSegments: either a run of visible text or a single
// photo directive.
type assistantMessageSegment struct {
// Text holds the visible text of a text segment; photo segments leave it empty.
Text string
// Photo points to the parsed directive of a photo segment; it is nil for text segments.
Photo *assistantPhotoDirective
}
// assistantPhotoDirective is the JSON payload carried by a telegram-photo
// directive line in assistant output.
type assistantPhotoDirective struct {
// Path is the local image file to send. sendAssistantPhoto rejects paths that
// are empty, not absolute, or lack a supported picture extension.
Path string `json:"path"`
// Caption is the optional photo caption; sendAssistantPhoto truncates it with
// truncateTelegramPhotoCaption before sending.
Caption string `json:"caption,omitempty"`
}
// outputState holds the per-thread output bookkeeping that the Bot keeps in
// its outputs map.
type outputState struct {
// chatID is the Telegram chat the state was created for (see newOutputState).
chatID int64
// assistant buffers assistant text; flushAssistantMessage resets it after
// taking the text out. NOTE(review): presumably filled by appendAssistantDelta - confirm.
assistant strings.Builder
// sentAny - NOTE(review): presumably set once any output has been delivered
// (see markOutputSent); confirm against its definition.
sentAny bool
// tools maps a string key to the state of a tool message.
// NOTE(review): the key looks like the item ID passed to upsertToolMessage - confirm.
tools map[string]toolMessageState
// sentImages records image paths already claimed by markImageOutputPending so
// that each path is sent at most once for this state.
sentImages map[string]bool
// workingIndicatorOff is the cancel function returned by startWorkingIndicator.
workingIndicatorOff context.CancelFunc
}
@@ -689,6 +703,15 @@ func (b *Bot) handleSandboxCommand(ctx context.Context, userID, chatID int64, se
return err
}
// isPicturePath reports whether path ends in one of the image extensions the
// bot sends as Telegram photos: .jpg, .jpeg, .png, .webp or .gif. The
// extension is compared case-insensitively.
func isPicturePath(path string) bool {
	ext := strings.ToLower(filepath.Ext(path))
	return ext == ".jpg" ||
		ext == ".jpeg" ||
		ext == ".png" ||
		ext == ".webp" ||
		ext == ".gif"
}
func (b *Bot) sendDiff(ctx context.Context, chatID int64, session store.Session) error {
if session.ActiveThreadID == 0 {
_, err := b.tg.SendMessage(ctx, chatID, "No active thread.", SendMessageOptions{})
@@ -1333,7 +1356,10 @@ func (b *Bot) handleCodexNotification(ctx context.Context, event codexapp.Event)
return b.flushAssistantMessage(ctx, params.ThreadID)
}
if params.ThreadID != "" {
return b.upsertToolMessage(ctx, params.ThreadID, item.ID, renderCodexItemCompleted(item))
if err := b.upsertToolMessage(ctx, params.ThreadID, item.ID, renderCodexItemCompleted(item)); err != nil {
return err
}
return b.sendImageOutput(ctx, params.ThreadID, item)
}
case "turn/diff/updated":
var params struct {
@@ -1456,6 +1482,7 @@ func (b *Bot) newOutputState(chatID int64) *outputState {
return &outputState{
chatID: chatID,
tools: make(map[string]toolMessageState),
sentImages: make(map[string]bool),
workingIndicatorOff: b.startWorkingIndicator(chatID),
}
}
@@ -1550,6 +1577,52 @@ func (b *Bot) failActiveOutputs(ctx context.Context, message string) {
}
}
// sendImageOutput sends the file saved by a completed imageGeneration item as
// a Telegram photo to the chat that receives output for threadID.
//
// Items of any other type, or with a blank SavedPath, are ignored. A path is
// claimed through markImageOutputPending before the upload is attempted, so
// each path is sent at most once per output state and a failed attempt is not
// retried. Delivery is best-effort: read, chat-lookup and upload failures are
// logged and the function always returns nil.
func (b *Bot) sendImageOutput(ctx context.Context, threadID string, item codexThreadItemView) error {
	path := strings.TrimSpace(item.SavedPath)
	if item.Type != "imageGeneration" || path == "" {
		return nil
	}
	if !b.markImageOutputPending(threadID, path) {
		return nil
	}
	data, err := os.ReadFile(path)
	if err != nil {
		b.logger.Printf("read generated image %s: %v", path, err)
		return nil
	}
	chatID, err := b.outputChatID(ctx, threadID)
	if err != nil {
		// Log the lookup failure like the other best-effort branches instead
		// of dropping the image without a trace.
		b.logger.Printf("resolve chat for generated image %s: %v", path, err)
		return nil
	}
	caption := "Generated image"
	if item.Status != "" {
		caption += ": " + item.Status
	}
	// Apply the same caption cap as assistant photo directives so an
	// unexpectedly long status cannot make the upload fail.
	caption = truncateTelegramPhotoCaption(caption)
	if _, err := b.tg.SendPhotoBytes(ctx, chatID, path, data, caption); err != nil {
		b.logger.Printf("send generated image %s: %v", path, err)
		return nil
	}
	b.markOutputSent(threadID)
	return nil
}
// markImageOutputPending claims path for sending on the output state of
// threadID. It returns true only for the first claim of a given path; it
// returns false when the thread has no output state or the path was already
// claimed. The check and the update happen under b.mu.
func (b *Bot) markImageOutputPending(threadID, path string) bool {
	b.mu.Lock()
	defer b.mu.Unlock()

	entry := b.outputs[threadID]
	switch {
	case entry == nil:
		return false
	case entry.sentImages == nil:
		// First image for this state: create the set with path already in it.
		entry.sentImages = map[string]bool{path: true}
		return true
	case entry.sentImages[path]:
		return false
	}
	entry.sentImages[path] = true
	return true
}
func (b *Bot) sendOutputBlock(ctx context.Context, threadID, block string) error {
block = strings.TrimSpace(block)
if block == "" {
@@ -1764,6 +1837,100 @@ func ignoreTelegramMessageNotModified(err error) error {
}
return err
}
// splitAssistantMessageSegments breaks assistant output into an ordered list
// of segments. Every line that parses as a photo directive becomes its own
// photo segment; the lines between directives are concatenated unchanged
// (line endings included) into text segments. Empty text runs produce no
// segment, so the result is nil for input without any content.
func splitAssistantMessageSegments(text string) []assistantMessageSegment {
	var (
		out     []assistantMessageSegment
		pending strings.Builder
	)
	// emitPending turns the text collected so far into a segment, if any.
	emitPending := func() {
		if pending.Len() > 0 {
			out = append(out, assistantMessageSegment{Text: pending.String()})
			pending.Reset()
		}
	}

	for _, raw := range strings.SplitAfter(text, "\n") {
		// Strip a trailing LF or CRLF before testing for a directive; the
		// untouched line is what gets kept as visible text.
		content := strings.TrimSuffix(strings.TrimSuffix(raw, "\n"), "\r")
		photo, isDirective := parseAssistantPhotoDirectiveLine(content)
		if !isDirective {
			pending.WriteString(raw)
			continue
		}
		emitPending()
		out = append(out, assistantMessageSegment{Photo: &photo})
	}
	emitPending()
	return out
}
// parseAssistantPhotoDirectiveLine tries to read line as a photo directive.
// After trimming surrounding whitespace the line must start with
// telegramPhotoDirectiveStart and end with telegramPhotoDirectiveEnd, and the
// text in between must be JSON that decodes into an assistantPhotoDirective.
// On success it returns the directive with Path and Caption trimmed and true;
// otherwise it returns the zero directive and false.
func parseAssistantPhotoDirectiveLine(line string) (assistantPhotoDirective, bool) {
	var none assistantPhotoDirective

	candidate := strings.TrimSpace(line)
	wrapped := strings.HasPrefix(candidate, telegramPhotoDirectiveStart) &&
		strings.HasSuffix(candidate, telegramPhotoDirectiveEnd)
	if !wrapped {
		return none, false
	}

	// Peel off the markers one after the other; the suffix is only removed
	// if it is still present once the prefix is gone.
	payload := strings.TrimPrefix(candidate, telegramPhotoDirectiveStart)
	payload = strings.TrimSuffix(payload, telegramPhotoDirectiveEnd)
	payload = strings.TrimSpace(payload)

	var parsed assistantPhotoDirective
	if json.Unmarshal([]byte(payload), &parsed) != nil {
		return none, false
	}
	parsed.Path = strings.TrimSpace(parsed.Path)
	parsed.Caption = strings.TrimSpace(parsed.Caption)
	return parsed, true
}
// sendAssistantText delivers assistant output to chatID segment by segment,
// in order: text segments that are not blank go out through sendLong, photo
// segments through sendAssistantPhoto. A photo that cannot be sent is logged
// and reported to the chat as a text message instead of aborting; only a
// failure of sendLong itself is returned to the caller.
func (b *Bot) sendAssistantText(ctx context.Context, chatID int64, text string) error {
	for _, part := range splitAssistantMessageSegments(text) {
		if strings.TrimSpace(part.Text) != "" {
			if err := b.sendLong(ctx, chatID, part.Text); err != nil {
				return err
			}
		}
		if part.Photo == nil {
			continue
		}
		photoErr := b.sendAssistantPhoto(ctx, chatID, *part.Photo)
		if photoErr == nil {
			continue
		}
		b.logger.Printf("send assistant photo: %v", photoErr)
		if err := b.sendLong(ctx, chatID, "Could not send photo: "+photoErr.Error()); err != nil {
			return err
		}
	}
	return nil
}
// sendAssistantPhoto validates a photo directive and uploads the referenced
// file to chatID as a Telegram photo.
//
// The path must be non-empty, absolute, and end in an extension accepted by
// isPicturePath. The caption is shortened with truncateTelegramPhotoCaption
// before sending. Error messages mention only the base name of the file.
// Read and upload failures wrap the underlying error, so callers can still
// inspect the cause with errors.Is / errors.As.
func (b *Bot) sendAssistantPhoto(ctx context.Context, chatID int64, directive assistantPhotoDirective) error {
	path := strings.TrimSpace(directive.Path)
	if path == "" {
		return errors.New("photo directive is missing a path")
	}
	if !filepath.IsAbs(path) {
		return fmt.Errorf("photo path must be absolute: %s", path)
	}
	if !isPicturePath(path) {
		return fmt.Errorf("unsupported photo type: %s", filepath.Base(path))
	}
	data, err := os.ReadFile(path)
	if err != nil {
		// %w keeps the message identical to %v but preserves the error chain.
		return fmt.Errorf("read %s: %w", filepath.Base(path), err)
	}
	caption := truncateTelegramPhotoCaption(directive.Caption)
	if _, err := b.tg.SendPhotoBytes(ctx, chatID, path, data, caption); err != nil {
		return fmt.Errorf("send %s: %w", filepath.Base(path), err)
	}
	return nil
}
// truncateTelegramPhotoCaption limits caption to telegramPhotoCaptionLimit
// runes. A caption within the limit is returned unchanged. A longer one is
// cut so that, together with a trailing "...", it is exactly the limit long;
// if the limit is too small to fit the ellipsis the caption is simply cut at
// the limit.
func truncateTelegramPhotoCaption(caption string) string {
	chars := []rune(caption)
	switch {
	case len(chars) <= telegramPhotoCaptionLimit:
		return caption
	case telegramPhotoCaptionLimit <= 3:
		return string(chars[:telegramPhotoCaptionLimit])
	default:
		return string(chars[:telegramPhotoCaptionLimit-3]) + "..."
	}
}
func (b *Bot) appendAssistantDelta(ctx context.Context, threadID, delta string) error {
if delta == "" {
return nil
@@ -1792,7 +1959,7 @@ func (b *Bot) flushAssistantMessage(ctx context.Context, threadID string) error
state.assistant.Reset()
b.mu.Unlock()
if err := b.sendLong(ctx, chatID, text); err != nil {
if err := b.sendAssistantText(ctx, chatID, text); err != nil {
return err
}
b.markOutputSent(threadID)

View File

@@ -77,6 +77,31 @@ func TestParseCommand(t *testing.T) {
}
}
// TestSplitAssistantMessageSegmentsWithPhotoDirective checks that a directive
// line between two text lines yields three segments: the leading text with
// its newline, the parsed photo directive, and the trailing text.
func TestSplitAssistantMessageSegmentsWithPhotoDirective(t *testing.T) {
	const input = "before\n<!-- telegram-photo {\"path\":\"/tmp/photo.jpg\",\"caption\":\"hello\"} -->\nafter"

	segments := splitAssistantMessageSegments(input)
	if got := len(segments); got != 3 {
		t.Fatalf("segments = %d, want 3: %#v", got, segments)
	}

	head, middle, tail := segments[0], segments[1], segments[2]
	if !(head.Text == "before\n" && head.Photo == nil) {
		t.Fatalf("unexpected first segment: %#v", head)
	}
	photoOK := middle.Photo != nil &&
		middle.Photo.Path == "/tmp/photo.jpg" &&
		middle.Photo.Caption == "hello"
	if !photoOK {
		t.Fatalf("unexpected photo segment: %#v", middle)
	}
	if !(tail.Text == "after" && tail.Photo == nil) {
		t.Fatalf("unexpected final segment: %#v", tail)
	}
}
// TestInvalidPhotoDirectiveStaysVisible checks that a directive-shaped line
// whose payload is not valid JSON is kept as a single ordinary text segment.
func TestInvalidPhotoDirectiveStaysVisible(t *testing.T) {
	const input = "<!-- telegram-photo not-json -->"

	got := splitAssistantMessageSegments(input)
	if len(got) == 1 && got[0].Text == input {
		return
	}
	t.Fatalf("invalid directive should stay text: %#v", got)
}
func TestRenderCodexCommandExecutionItem(t *testing.T) {
output := "line 1\nline 2"
exitCode := 0