Source code for jarkdown.markdown_converter

"""Converter for transforming Jira issues to Markdown format."""

import re
import yaml
from urllib.parse import quote
from markdownify import markdownify as md



[docs]
class MarkdownConverter:
    """Converts Jira issue data into Markdown format."""


[docs]
    def __init__(self, base_url, domain):
        """Initialize the markdown converter.

        Args:
            base_url: Base URL of the Jira instance (e.g., 'https://company.atlassian.net')
            domain: Jira domain (e.g., 'company.atlassian.net')
        """
        self.base_url = base_url
        self.domain = domain



[docs]
    def convert_html_to_markdown(self, html_content):
        """Convert HTML content to Markdown.

        Args:
            html_content: HTML string to convert

        Returns:
            str: Converted markdown content
        """
        if not html_content:
            return ""

        # Convert HTML to Markdown using markdownify
        markdown = md(html_content, heading_style="ATX", bullets="*-+")

        # Clean up any residual HTML tags that weren't converted
        markdown = re.sub(r"<[^>]+>", "", markdown)

        # Clean up excessive whitespace
        markdown = re.sub(r"\n{3,}", "\n\n", markdown)

        return markdown.strip()



[docs]
    def replace_attachment_links(self, markdown_content, downloaded_attachments):
        """Replace Jira attachment URLs with local file references.

        Args:
            markdown_content: Markdown content with Jira attachment URLs
            downloaded_attachments: List of downloaded attachment info

        Returns:
            str: Markdown content with local file references
        """
        if not downloaded_attachments:
            return markdown_content

        # Escape special regex characters in domain
        escaped_domain = re.escape(self.domain)

        # For secure attachment URLs with filename in path
        for attachment in downloaded_attachments:
            filename = attachment["filename"]
            original_filename = attachment["original_filename"]

            # URL encode filename for markdown links
            encoded_filename = quote(filename, safe="")

            if original_filename:
                # Escape the original filename for regex
                escaped_original = re.escape(original_filename)
                # Also try URL-encoded version of the filename
                encoded_original = re.escape(quote(original_filename, safe=""))
                # Pattern for secure URLs with this specific filename (regular or URL-encoded)
                patterns_to_try = [
                    f"https?://{escaped_domain}/secure/attachment/[0-9]+/{escaped_original}",
                    f"https?://{escaped_domain}/secure/attachment/[0-9]+/{encoded_original}",
                ]

                for pattern in patterns_to_try:
                    # Replace in images: ![alt](url) -> ![alt](filename)
                    markdown_content = re.sub(
                        f"(!\\[[^\\]]*\\])\\({pattern}\\)",
                        f"\\1({encoded_filename})",
                        markdown_content,
                    )
                    # Replace in links: [text](url) -> [text](filename)
                    markdown_content = re.sub(
                        f"(\\[[^\\]]+\\])\\({pattern}\\)",
                        f"\\1({encoded_filename})",
                        markdown_content,
                    )

        # For generic attachment content URLs (without filename in path)
        # Replace all remaining Jira attachment URLs with placeholder
        # This is a fallback for URLs that don't have the filename in them
        patterns = [
            f"https?://{escaped_domain}/jira/rest/api/[0-9]/attachment/content/[0-9]+",
            f"https?://{escaped_domain}/rest/api/[0-9]/attachment/content/[0-9]+",
            f"https?://{escaped_domain}/jira/rest/api/[0-9]/attachment/thumbnail/[0-9]+",
        ]

        for pattern in patterns:
            # For any remaining attachment URLs, try to infer from context
            # Look for patterns like ![filename](url) and use the filename from the alt text
            markdown_content = re.sub(
                f"!\\[([^\\]]*)\\]\\({pattern}\\)",
                lambda m: f"![{m.group(1)}]({quote(m.group(1), safe='')})"
                if m.group(1)
                else m.group(0),
                markdown_content,
            )
            # For links, keep the link text but replace URL
            markdown_content = re.sub(
                f"\\[([^\\]]+)\\]\\({pattern}\\)",
                lambda m: f"[{m.group(1)}]({quote(m.group(1), safe='')})"
                if m.group(1)
                else m.group(0),
                markdown_content,
            )

        return markdown_content


    def _parse_adf_to_markdown(self, adf_content):
        """Parse Atlassian Document Format to Markdown.

        Args:
            adf_content: ADF structure (dict) or string content

        Returns:
            str: Converted markdown text
        """
        if isinstance(adf_content, str):
            # If it's just a string, return it as-is
            return adf_content

        if not isinstance(adf_content, dict):
            return ""

        doc_type = adf_content.get("type", "")

        # Handle different node types
        if doc_type == "doc":
            # Document root - process all content nodes
            content = adf_content.get("content", [])
            return "\n\n".join(self._parse_adf_to_markdown(node) for node in content)

        elif doc_type == "paragraph":
            # Paragraph - process inline content
            content = adf_content.get("content", [])
            if not content:
                return ""
            return "".join(self._parse_adf_to_markdown(node) for node in content)

        elif doc_type == "text":
            # Text node - apply marks if any
            text = adf_content.get("text", "")
            marks = adf_content.get("marks", [])

            for mark in marks:
                mark_type = mark.get("type", "")
                if mark_type == "strong":
                    text = f"**{text}**"
                elif mark_type == "em":
                    text = f"*{text}*"
                elif mark_type == "code":
                    text = f"`{text}`"
                elif mark_type == "link":
                    href = mark.get("attrs", {}).get("href", "")
                    text = f"[{text}]({href})"

            return text

        elif doc_type == "bulletList":
            # Bullet list
            content = adf_content.get("content", [])
            items = []
            for item in content:
                item_text = self._parse_adf_to_markdown(item)
                # Add bullet point prefix
                for line in item_text.split("\n"):
                    if line:
                        items.append(f"- {line}")
            return "\n".join(items)

        elif doc_type == "orderedList":
            # Ordered list
            content = adf_content.get("content", [])
            items = []
            for i, item in enumerate(content, 1):
                item_text = self._parse_adf_to_markdown(item)
                # Add number prefix
                for j, line in enumerate(item_text.split("\n")):
                    if line:
                        if j == 0:
                            items.append(f"{i}. {line}")
                        else:
                            items.append(f"   {line}")
            return "\n".join(items)

        elif doc_type == "listItem":
            # List item - process content
            content = adf_content.get("content", [])
            return "\n".join(self._parse_adf_to_markdown(node) for node in content)

        elif doc_type == "heading":
            # Heading
            level = adf_content.get("attrs", {}).get("level", 1)
            content = adf_content.get("content", [])
            text = "".join(self._parse_adf_to_markdown(node) for node in content)
            return f"{'#' * level} {text}"

        elif doc_type == "codeBlock":
            # Code block
            content = adf_content.get("content", [])
            code = "\n".join(self._parse_adf_to_markdown(node) for node in content)
            language = adf_content.get("attrs", {}).get("language", "")
            return f"```{language}\n{code}\n```"

        elif doc_type == "blockquote":
            # Blockquote
            content = adf_content.get("content", [])
            quote_text = "\n".join(
                self._parse_adf_to_markdown(node) for node in content
            )
            # Add > prefix to each line
            return "\n".join(f"> {line}" for line in quote_text.split("\n"))

        elif doc_type == "mediaSingle" or doc_type == "media":
            # Media/attachment
            attrs = adf_content.get("attrs", {})
            # Try to get filename or alt text
            filename = attrs.get("alt", "") or attrs.get("title", "") or "attachment"
            # For now, just create a placeholder that will be replaced later
            return f"![{filename}](attachment)"

        elif doc_type == "mention":
            # User mention
            attrs = adf_content.get("attrs", {})
            text = attrs.get("text", "") or attrs.get("id", "@user")
            return f"@{text}"

        elif doc_type == "hardBreak":
            return "\n"

        else:
            # Unknown type - try to process content if it exists
            content = adf_content.get("content", [])
            if content:
                return "\n".join(self._parse_adf_to_markdown(node) for node in content)
            return ""

    def _compose_comments_section(self, issue_data, downloaded_attachments):
        """Compose the comments section of the markdown.

        Args:
            issue_data: Raw issue data from Jira API
            downloaded_attachments: List of downloaded attachment info

        Returns:
            list: Lines of markdown content for the comments section, or empty list if no comments
        """
        fields = issue_data.get("fields", {})
        comment_data = fields.get("comment", {})
        comments = comment_data.get("comments", [])

        if not comments:
            return []

        lines = []
        lines.append("## Comments")
        lines.append("")

        for i, comment in enumerate(comments):
            # Extract author and date
            author = comment.get("author", {}).get("displayName", "Unknown")
            created = comment.get("created", "")

            # Format the date (ISO 8601 to readable format)
            if created:
                # Parse and format: '2025-08-16T10:30:00.000+0000' -> '2025-08-16 10:30 AM'
                from datetime import datetime

                try:
                    # Handle various ISO formats
                    if created.endswith("Z"):
                        created = created[:-1] + "+00:00"
                    elif "+" in created and not created.endswith("+00:00"):
                        # Replace +0000 with +00:00
                        created = created.replace("+0000", "+00:00")

                    dt = datetime.fromisoformat(created)
                    formatted_date = dt.strftime("%Y-%m-%d %I:%M %p")
                except Exception:
                    formatted_date = created
            else:
                formatted_date = "Unknown date"

            # Format the comment header
            lines.append(f"**{author}** - _{formatted_date}_")
            lines.append("")

            # Process the comment body
            # Check if we have rendered HTML first
            body_html = comment.get("renderedBody", "")

            if body_html:
                # Use rendered HTML if available
                body_md = self.convert_html_to_markdown(body_html)
            else:
                # Check for ADF body structure
                body = comment.get("body")
                if isinstance(body, dict):
                    # Parse ADF structure
                    body_md = self._parse_adf_to_markdown(body)
                elif isinstance(body, str) and body:
                    # Plain text body
                    body_md = body
                else:
                    body_md = "*No comment body*"

            # Replace attachment links in the comment
            body_md = self.replace_attachment_links(body_md, downloaded_attachments)

            # Add the comment body
            lines.append(body_md)

            # Add separator between comments (except after the last one)
            if i < len(comments) - 1:
                lines.append("")
                lines.append("---")
                lines.append("")

        lines.append("")  # Add final spacing

        return lines

    def _generate_metadata_dict(self, issue_data):
        """Generate metadata dictionary from Jira issue data.

        Args:
            issue_data: Raw issue data from Jira API

        Returns:
            dict: Metadata dictionary for YAML frontmatter
        """
        fields = issue_data.get("fields", {})
        metadata = {}

        # Required fields
        metadata["key"] = issue_data.get("key", "UNKNOWN")
        metadata["summary"] = fields.get("summary", "No Summary")

        # Type and status
        if fields.get("issuetype"):
            metadata["type"] = fields["issuetype"].get("name")
        if fields.get("status"):
            metadata["status"] = fields["status"].get("name")

        # Priority
        if fields.get("priority"):
            metadata["priority"] = fields["priority"].get("name")

        # Resolution
        if fields.get("resolution"):
            metadata["resolution"] = fields["resolution"].get("name")

        # People
        if fields.get("assignee"):
            metadata["assignee"] = fields["assignee"].get("displayName")
        if fields.get("reporter"):
            metadata["reporter"] = fields["reporter"].get("displayName")
        if fields.get("creator"):
            metadata["creator"] = fields["creator"].get("displayName")

        # Labels
        if fields.get("labels"):
            metadata["labels"] = fields["labels"]

        # Components
        if fields.get("components"):
            metadata["components"] = [
                comp.get("name") for comp in fields["components"] if comp.get("name")
            ]

        # Parent issue (for sub-tasks)
        if fields.get("parent"):
            metadata["parent_key"] = fields["parent"].get("key")
            if fields["parent"].get("fields", {}).get("summary"):
                metadata["parent_summary"] = fields["parent"]["fields"]["summary"]

        # Versions
        if fields.get("versions"):
            metadata["affects_versions"] = [
                ver.get("name") for ver in fields["versions"] if ver.get("name")
            ]
        if fields.get("fixVersions"):
            metadata["fix_versions"] = [
                ver.get("name") for ver in fields["fixVersions"] if ver.get("name")
            ]

        # Dates
        if fields.get("created"):
            metadata["created_at"] = fields["created"]
        if fields.get("updated"):
            metadata["updated_at"] = fields["updated"]
        if fields.get("resolutiondate"):
            metadata["resolved_at"] = fields["resolutiondate"]

        # Remove None values and empty lists
        return {
            k: v
            for k, v in metadata.items()
            if v is not None and (not isinstance(v, list) or v)
        }


[docs]
    def compose_markdown(self, issue_data, downloaded_attachments):
        """Compose the final markdown file content.

        The document consists of, in order: YAML frontmatter built from the
        issue metadata, a title linking to the issue, a "Description"
        section, the comments section (when there are comments) and an
        "Attachments" list (when attachments were downloaded).

        The description is taken from ``renderedFields.description`` (HTML)
        when available. Otherwise the raw ``fields.description`` is used: an
        ADF dict is converted with ``_parse_adf_to_markdown`` and a plain
        string is treated as a single HTML paragraph.

        Args:
            issue_data: Raw issue data from Jira API
            downloaded_attachments: List of downloaded attachment info

        Returns:
            str: Complete markdown content for the issue
        """
        fields = issue_data.get("fields") or {}
        # renderedFields may be missing or null; treat both as empty.
        rendered_fields = issue_data.get("renderedFields") or {}

        metadata = self._generate_metadata_dict(issue_data)

        # Key and summary for the title
        key = metadata.get("key", "UNKNOWN")
        summary = metadata.get("summary", "No Summary")

        lines = []

        # YAML frontmatter; sort_keys=False keeps the metadata's own order.
        yaml_content = yaml.dump(
            metadata, default_flow_style=False, allow_unicode=True, sort_keys=False
        )
        lines.append("---")
        lines.append(yaml_content.rstrip())
        lines.append("---")
        lines.append("")

        # Title with link to Jira issue
        lines.append(f"# [{key}]({self.base_url}/browse/{key}): {summary}")
        lines.append("")

        # Description section
        lines.append("## Description")
        lines.append("")

        description_html = rendered_fields.get("description", "")
        raw_description = fields.get("description")
        if description_html:
            description_md = self.convert_html_to_markdown(description_html)
        elif isinstance(raw_description, dict):
            # Raw ADF document: convert it instead of printing the dict repr.
            description_md = self._parse_adf_to_markdown(raw_description)
        elif raw_description:
            description_md = self.convert_html_to_markdown(
                f"<p>{raw_description}</p>"
            )
        else:
            description_md = ""

        if description_md:
            lines.append(
                self.replace_attachment_links(description_md, downloaded_attachments)
            )
        else:
            lines.append("*No description provided*")

        lines.append("")

        # Comments section (after description, before attachments)
        lines.extend(self._compose_comments_section(issue_data, downloaded_attachments))

        # Attachments section
        if downloaded_attachments:
            lines.append("## Attachments")
            lines.append("")
            for attachment in downloaded_attachments:
                filename = attachment["filename"]
                mime_type = attachment["mime_type"]
                encoded_filename = quote(filename, safe="")

                if mime_type and mime_type.startswith("image/"):
                    # Embed images
                    lines.append(f"- ![{filename}]({encoded_filename})")
                else:
                    # Link other files
                    lines.append(f"- [{filename}]({encoded_filename})")
            lines.append("")

        return "\n".join(lines)