import streamlit as st
import fitz  # PyMuPDF
import re
import os
import tempfile
import zipfile
from io import BytesIO

class StreamlitMmpiExtractor:
    """Extract and tidy the text of uploaded MMPI-2 report PDFs.

    Progress, warnings and errors are reported through Streamlit widgets, so
    ``process_files`` must run inside a Streamlit script.

    Attributes:
        results: Mapping of extracted name -> formatted text produced by the
            most recent ``process_files`` call (empty before the first call).
    """

    # Zero-based indices of the pages whose text is extracted; page 0 is only
    # consulted for the name.
    PAGES_TO_EXTRACT = (1, 2, 3, 4)
    # Documents with fewer pages than this are skipped with a warning.
    MIN_PAGE_COUNT = 5
    # Placeholder returned when no name can be found on the first page.
    UNKNOWN_NAME = "이름모름"

    def __init__(self):
        self.results = {}

    def extract_name(self, page):
        """Return the name found on *page*, or a placeholder.

        The page text is searched for a "이름:" / "성명:" label first. If there
        is none, the first non-empty line of the first non-blank text block is
        used instead.

        Args:
            page: Object exposing ``get_text("text")`` and
                ``get_text("blocks")`` (a PyMuPDF page in this application).

        Returns:
            The extracted name, or ``UNKNOWN_NAME`` when nothing usable is
            found or the page cannot be read.
        """
        try:
            text = page.get_text("text")
            match = re.search(r"(?:이름|성명)\s*:\s*(\S+)", text)
            if match:
                return match.group(1)
            # Index 4 of each block tuple is read as the block's text. Blank
            # blocks are skipped so an empty string is never used as a name.
            for block in page.get_text("blocks") or ():
                first_line = block[4].strip().split('\n')[0].strip()
                if first_line:
                    return first_line
        except Exception:
            # Deliberate best effort: a page that cannot be read must not
            # abort processing of the file, so fall through to the placeholder.
            pass
        return self.UNKNOWN_NAME

    @staticmethod
    def _is_number(token):
        """Return True if *token* is a single numeric literal such as "50" or "-1.5"."""
        # float() also accepts the words "nan", "inf" and "infinity"; requiring
        # at least one digit keeps such alphabetic tokens from being merged.
        if not token or ' ' in token or not any(ch.isdigit() for ch in token):
            return False
        try:
            float(token)
        except ValueError:
            return False
        return True

    def reformat_numeric_blocks(self, text):
        """Append stand-alone numeric lines to the preceding non-blank line.

        A line that consists of a single number is joined (space-separated) to
        the line before it, unless that line is blank. All other lines,
        including the first one, are kept unchanged.

        Args:
            text: Newline-separated text.

        Returns:
            The reformatted text, joined with "\\n".
        """
        merged = []
        for line in text.split('\n'):
            token = line.strip()
            if merged and merged[-1].strip() and self._is_number(token):
                merged[-1] += f" {token}"
            else:
                merged.append(line)
        return "\n".join(merged)

    @staticmethod
    def _drop_duplicate_lines(text):
        """Keep blank lines and the first occurrence of every non-blank line."""
        # NOTE(review): this also drops repeated score lines (e.g. a second
        # stand-alone "50") before numeric lines are merged — confirm that this
        # is intended and not only meant for repeated headers/footers.
        seen = set()
        kept = []
        for line in text.split('\n'):
            key = line.strip()
            if key:
                if key in seen:
                    continue
                seen.add(key)
            kept.append(line)
        return "\n".join(kept)

    @staticmethod
    def _unique_name(name, file_number, taken):
        """Return *name*, suffixed if necessary so that it is not a key of *taken*."""
        if name not in taken:
            return name
        candidate = f"{name}_{file_number}"
        extra = 2
        # The suffixed name can collide as well; never overwrite a result.
        while candidate in taken:
            candidate = f"{name}_{file_number}_{extra}"
            extra += 1
        return candidate

    @staticmethod
    def _remove_temp_file(path):
        """Delete the temporary PDF at *path*, warning instead of failing."""
        try:
            os.unlink(path)
        except OSError as e:
            st.warning(f"임시 파일을 삭제하지 못했습니다: {e}")

    def _extract_report(self, pdf_bytes):
        """Extract ``(name, formatted_text)`` from one PDF given as bytes.

        Returns ``None`` when the document has fewer than ``MIN_PAGE_COUNT``
        pages. The document is closed and the temporary file removed on every
        path, including when an exception propagates.
        """
        tmp_file_path = None
        try:
            # Save to a temporary file so the PDF can be opened by path.
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file_path = tmp_file.name
                tmp_file.write(pdf_bytes)

            doc = fitz.open(tmp_file_path)
            try:
                if doc.page_count < self.MIN_PAGE_COUNT:
                    return None

                name = self.extract_name(doc.load_page(0))

                # Concatenate the text of all requested pages.
                combined_raw_text = "".join(
                    doc.load_page(page_num).get_text("text") + "\n"
                    for page_num in self.PAGES_TO_EXTRACT
                    if page_num < doc.page_count
                )
            finally:
                # Close before unlinking: an open document can keep the file
                # locked on some platforms.
                doc.close()
        finally:
            if tmp_file_path is not None:
                self._remove_temp_file(tmp_file_path)

        deduplicated_text = self._drop_duplicate_lines(combined_raw_text)
        return name, self.reformat_numeric_blocks(deduplicated_text)

    def process_files(self, uploaded_files):
        """Extract the report text from every uploaded PDF.

        Shows a progress bar and a status line while working. Files with too
        few pages produce a warning; files that raise produce an error message.
        In both cases processing continues with the next file.

        Args:
            uploaded_files: Sequence of objects exposing ``name`` and
                ``getvalue()`` (Streamlit uploaded files in this application).

        Returns:
            Dict mapping each extracted (de-duplicated) name to its formatted
            text. The same dict is stored in ``self.results``.
        """
        results = {}

        progress_bar = st.progress(0)
        status_text = st.empty()
        total_files = len(uploaded_files)

        for i, uploaded_file in enumerate(uploaded_files):
            try:
                status_text.text(f'처리 중: {uploaded_file.name} ({i+1}/{total_files})')

                extracted = self._extract_report(uploaded_file.getvalue())
                if extracted is None:
                    st.warning(
                        f"{uploaded_file.name}: 페이지 수가 부족합니다 "
                        f"(최소 {self.MIN_PAGE_COUNT}페이지 필요)"
                    )
                    continue

                name, formatted_text = extracted
                results[self._unique_name(name, i + 1, results)] = formatted_text
            except Exception as e:
                st.error(f"Error processing {uploaded_file.name}: {e}")
            finally:
                # Advance for skipped and failed files too, so the bar always
                # reaches 100% at the end.
                progress_bar.progress((i + 1) / total_files)

        status_text.text(f'완료: {len(results)}개 파일 처리됨')
        self.results = results
        return results

# Session-state slot that keeps the latest extraction alive across reruns.
_EXTRACTION_STATE_KEY = "mmpi_extraction"


def _render_download_buttons(results):
    """Render the three bulk download buttons (all / AI / JSON) for *results*."""
    import json  # local import: json is not imported at file level

    col_all, col_ai, col_json = st.columns(3)

    with col_all:
        # Every result, each preceded by a header line with its name.
        all_content = "\n\n".join(
            f"===== {name} =====\n{content}"
            for name, content in results.items()
        )
        st.download_button(
            "📋 전체 결과 다운로드",
            all_content,
            file_name="mmpi_results_all.txt",
            mime="text/plain",
            use_container_width=True
        )

    with col_ai:
        # Data for AI use: the per-result name headers are left out.
        ai_content = "\n\n".join(results.values())
        st.download_button(
            "🤖 AI용 데이터 다운로드",
            ai_content,
            file_name="mmpi_results_for_ai.txt",
            mime="text/plain",
            use_container_width=True
        )

    with col_json:
        # The name -> text mapping serialised as JSON.
        json_content = json.dumps(results, ensure_ascii=False, indent=2)
        st.download_button(
            "📄 JSON 형태 다운로드",
            json_content,
            file_name="mmpi_results.json",
            mime="application/json",
            use_container_width=True
        )


def _render_individual_results(results):
    """Show each result in its own tab, or behind a select box when there are many."""
    st.subheader("📝 개별 결과 보기")

    if len(results) <= 10:  # too many tabs would be unwieldy; see else branch
        tabs = st.tabs(list(results.keys()))
        for tab, (name, content) in zip(tabs, results.items()):
            with tab:
                st.text_area(
                    f"📄 {name}",
                    content,
                    height=400,
                    key=f"content_{name}"
                )
                st.download_button(
                    f"💾 {name} 개별 다운로드",
                    content,
                    file_name=f"mmpi_{name}.txt",
                    mime="text/plain",
                    key=f"download_{name}"
                )
        return

    # Many results: let the user pick one from a select box instead.
    selected_name = st.selectbox(
        "결과를 선택하세요:",
        list(results.keys())
    )
    if selected_name:
        # The keys include the selection so that stale widget state from a
        # previously selected result is never shown for the new one.
        st.text_area(
            f"📄 {selected_name}",
            results[selected_name],
            height=400,
            key=f"selected_content_{selected_name}"
        )
        st.download_button(
            f"💾 {selected_name} 다운로드",
            results[selected_name],
            file_name=f"mmpi_{selected_name}.txt",
            mime="text/plain",
            key=f"selected_download_{selected_name}"
        )


def _render_statistics(results, total_files):
    """Render the processing metrics for *results* out of *total_files* uploads."""
    st.subheader("📈 처리 통계")
    col_count, col_total, col_avg, col_rate = st.columns(4)

    total_chars = sum(len(content) for content in results.values())
    avg_chars = total_chars // len(results) if results else 0

    with col_count:
        st.metric("처리된 파일", len(results))
    with col_total:
        st.metric("총 추출 문자수", f"{total_chars:,}")
    with col_avg:
        st.metric("평균 문자수", f"{avg_chars:,}")
    with col_rate:
        st.metric("성공률", f"{len(results)}/{total_files}")


def main():
    """Build the Streamlit page: upload PDFs, run the extraction, show the results.

    The extraction output is kept in ``st.session_state``. The extract button
    is only true on the run triggered by its click, while every other widget
    interaction (download buttons, the select box) reruns the script; without
    the stored copy the results would vanish on the first such interaction.
    The stored copy is discarded as soon as the set of uploaded files changes.
    """
    st.set_page_config(
        page_title="MMPI-2 데이터 추출기", 
        page_icon="📄",
        layout="wide"
    )

    st.title("🔍 MMPI-2 데이터 추출기 (웹버전)")
    st.markdown("---")

    st.markdown("""
    ### 📋 사용 방법
    1. **PDF 파일 업로드**: 여러 개의 MMPI-2 보고서 PDF를 동시에 업로드
    2. **자동 처리**: 각 파일에서 이름과 주요 데이터 추출
    3. **결과 확인**: 추출된 텍스트를 개별/통합으로 다운로드
    
    ⚠️ **주의사항**: MMPI-2 표준 보고서 형식만 지원합니다.
    """)

    # Settings live in the sidebar.
    st.sidebar.title("⚙️ 설정")
    max_files = st.sidebar.slider("최대 업로드 파일 수", 1, 20, 10)

    # File upload.
    uploaded_files = st.file_uploader(
        "📁 MMPI-2 PDF 파일들을 선택하세요",
        type=['pdf'],
        accept_multiple_files=True,
        help="여러 개의 MMPI-2 PDF 보고서를 동시에 업로드할 수 있습니다.",
        key="pdf_uploader"
    )

    if uploaded_files:
        if len(uploaded_files) > max_files:
            st.warning(f"최대 {max_files}개 파일까지만 처리 가능합니다. 처음 {max_files}개 파일만 처리됩니다.")
            uploaded_files = uploaded_files[:max_files]

        st.success(f"📁 선택된 파일: {len(uploaded_files)}개")

        # Sizes are computed once and reused for the listing and the signature.
        file_sizes = [len(file.getvalue()) for file in uploaded_files]

        # List the selected files.
        with st.expander("📋 선택된 파일 목록", expanded=True):
            for i, (file, size) in enumerate(zip(uploaded_files, file_sizes), 1):
                file_size = size / 1024 / 1024  # MB
                st.write(f"{i}. **{file.name}** ({file_size:.1f} MB)")

        # Drop a stored extraction that belongs to a different set of files.
        signature = tuple(
            (file.name, size) for file, size in zip(uploaded_files, file_sizes)
        )
        stored = st.session_state.get(_EXTRACTION_STATE_KEY)
        if stored is not None and stored["signature"] != signature:
            del st.session_state[_EXTRACTION_STATE_KEY]
            stored = None

        # Extract button, centred in the middle of three equal columns.
        _, center_col, _ = st.columns([1, 1, 1])
        with center_col:
            extract_button = st.button(
                "🚀 데이터 추출 시작", 
                type="primary",
                use_container_width=True
            )

        if extract_button:
            st.markdown("---")
            st.subheader("🔄 처리 진행 상황")

            with st.spinner("데이터를 추출하는 중..."):
                extractor = StreamlitMmpiExtractor()
                results = extractor.process_files(uploaded_files)

            stored = {
                "signature": signature,
                "results": results,
                "total_files": len(uploaded_files),
            }
            st.session_state[_EXTRACTION_STATE_KEY] = stored

        if stored is not None:
            results = stored["results"]
            if results:
                st.success(f"✅ {len(results)}개 파일 처리 완료!")
                st.markdown("---")

                # Results section: bulk downloads first.
                st.subheader("📊 추출 결과")
                _render_download_buttons(results)

                st.markdown("---")

                # Individual results.
                _render_individual_results(results)

                # Statistics.
                st.markdown("---")
                _render_statistics(results, stored["total_files"])
            else:
                st.error("⚠️ 처리된 파일이 없습니다. PDF 파일 형식을 확인해주세요.")
                st.info("""
                **가능한 원인:**
                - PDF 파일이 MMPI-2 표준 형식이 아님
                - PDF 파일이 손상됨
                - 페이지 수가 5페이지 미만
                """)
    elif _EXTRACTION_STATE_KEY in st.session_state:
        # No files selected any more: do not keep extracted text around.
        del st.session_state[_EXTRACTION_STATE_KEY]

    # Footer.
    st.markdown("---")
    st.markdown("""
    <div style='text-align: center'>
        <small>
        🔒 <strong>개인정보 보호:</strong> 업로드된 파일은 처리 후 즉시 삭제됩니다.<br>
        💡 <strong>기술 지원:</strong> 문제가 있으시면 관리자에게 문의하세요.
        </small>
    </div>
    """, unsafe_allow_html=True)

# Script entry point: build the page only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()