import streamlit as st
import fitz  # PyMuPDF
import re
import os
import tempfile

class MmpiExtractor:
    """Extract and tidy the text of PDF reports uploaded through Streamlit.

    For every upload the extractor reads a name from the first page,
    concatenates the text of the pages listed in ``_PAGES_TO_EXTRACT``,
    drops repeated lines, and glues stand-alone numeric lines onto the line
    that precedes them.
    """

    # Matches a "이름:" / "성명:" label and captures the token that follows it.
    _NAME_PATTERN = re.compile(r"(?:이름|성명)\s*:\s*(\S+)")
    # Placeholder returned when no name can be determined.
    _UNKNOWN_NAME = "이름모름"
    # Zero-based indices of the pages whose text is extracted.
    _PAGES_TO_EXTRACT = (1, 2, 3, 4)

    def extract_name(self, page):
        """Return the name found on *page*, or ``"이름모름"`` if none is found.

        The page text is first searched for a ``이름:`` / ``성명:`` label. If
        there is no such label, the first line of the first non-blank block
        returned by ``page.get_text("blocks")`` is used instead.

        Args:
            page: A PyMuPDF page (any object exposing ``get_text(kind)``).

        Returns:
            The extracted name; never an empty string.
        """
        try:
            match = self._NAME_PATTERN.search(page.get_text("text"))
            if match:
                return match.group(1)
            for block in page.get_text("blocks") or ():
                # Index 4 of a block tuple is read as the block's text.
                block_text = block[4].strip()
                if block_text:
                    # Skip whitespace-only blocks so the name is never "".
                    return block_text.split('\n')[0].strip()
        except Exception:
            # Deliberately best-effort: a page that cannot be read must not
            # abort the whole file, so fall back to the placeholder name.
            return self._UNKNOWN_NAME
        return self._UNKNOWN_NAME

    def reformat_numeric_blocks(self, text):
        """Append stand-alone numeric lines to the line that precedes them.

        A line counts as numeric when, after stripping, it is non-empty,
        contains no space, and is accepted by ``float()``. Such a line is
        joined to the previous output line with a single space, unless that
        previous line is blank. The first line is always kept as is.

        Args:
            text: Newline-separated text.

        Returns:
            The reformatted text, newline-separated.
        """
        lines = text.split('\n')  # always yields at least one element
        merged = [lines[0]]
        for line in lines[1:]:
            token = line.strip()
            if self._is_number(token) and merged[-1].strip():
                merged[-1] += f" {token}"
            else:
                merged.append(line)
        return "\n".join(merged)

    @staticmethod
    def _is_number(token):
        """Return True if *token* is a single space-free token ``float()`` accepts."""
        if not token or ' ' in token:
            return False
        try:
            float(token)
        except ValueError:
            return False
        return True

    @staticmethod
    def _dedupe_lines(text):
        """Drop every non-blank line whose stripped form was already seen.

        Blank lines are always kept; the first occurrence of each non-blank
        line keeps its original (unstripped) form.
        """
        # NOTE(review): this also drops repeated stand-alone numeric lines
        # (e.g. two identical values on separate lines) - confirm that this
        # is intended and does not lose data.
        seen = set()
        kept = []
        for line in text.split('\n'):
            key = line.strip()
            if key:
                if key in seen:
                    continue
                seen.add(key)
            kept.append(line)
        return "\n".join(kept)

    @staticmethod
    def _unique_name(name, taken, index):
        """Return *name*, suffixed if needed so it is not a key of *taken*.

        The first candidate on a clash is ``"{name}_{index + 1}"``; if that
        is taken as well, a running counter is appended until the name is
        free, so an existing entry is never overwritten.
        """
        if name not in taken:
            return name
        base = f"{name}_{index + 1}"
        candidate = base
        counter = 2
        while candidate in taken:
            candidate = f"{base}_{counter}"
            counter += 1
        return candidate

    def _read_report(self, pdf_path):
        """Open the PDF at *pdf_path* and return ``(name, raw_text)``.

        Returns ``None`` when the document has too few pages to contain every
        page in ``_PAGES_TO_EXTRACT``. The document is always closed before
        returning, so the caller can safely delete the file afterwards.
        """
        doc = fitz.open(pdf_path)
        try:
            if doc.page_count <= max(self._PAGES_TO_EXTRACT):
                return None
            name = self.extract_name(doc.load_page(0))
            raw_text = "".join(
                doc.load_page(page_num).get_text("text") + "\n"
                for page_num in self._PAGES_TO_EXTRACT
            )
            return name, raw_text
        finally:
            doc.close()

    def process_files(self, uploaded_files):
        """Extract cleaned report text from every uploaded PDF.

        Each upload is written to a temporary ``.pdf`` file, opened with
        PyMuPDF and reduced to de-duplicated, reformatted text. Files with
        too few pages are skipped with ``st.warning``; files that raise are
        reported with ``st.error`` and skipped. The temporary file is removed
        and the Streamlit progress bar advanced after every file, whatever
        its outcome.

        Args:
            uploaded_files: Sequence of uploaded file objects exposing
                ``getvalue()`` and ``name``.

        Returns:
            Dict mapping a unique name per report to its extracted text, in
            processing order.
        """
        results = {}
        progress_bar = st.progress(0)
        total_files = len(uploaded_files)

        for i, uploaded_file in enumerate(uploaded_files):
            tmp_file_path = None
            try:
                # Write the upload to a temporary file on disk.
                with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                    tmp_file_path = tmp_file.name
                    tmp_file.write(uploaded_file.getvalue())

                report = self._read_report(tmp_file_path)
                if report is None:
                    st.warning(f"⚠️ {uploaded_file.name}: 페이지 수가 부족하여 건너뜁니다.")
                    continue

                name, raw_text = report
                formatted_text = self.reformat_numeric_blocks(
                    self._dedupe_lines(raw_text)
                )
                results[self._unique_name(name, results, i)] = formatted_text
            except Exception as e:
                st.error(f"Error processing {uploaded_file.name}: {e}")
            finally:
                # Runs on success, skip (`continue`) and failure alike.
                if tmp_file_path is not None:
                    try:
                        os.unlink(tmp_file_path)
                    except OSError:
                        # Cleanup is best-effort; a leftover temp file must
                        # not abort the remaining uploads.
                        pass
                progress_bar.progress((i + 1) / total_files)

        return results

def main():
    """Render the Streamlit page: upload PDFs, run extraction, offer downloads.

    Extraction results are kept in ``st.session_state`` so that they stay on
    screen across reruns. Without this, any rerun (for example the one
    triggered by clicking a download button) would make ``st.button`` return
    False and the results would disappear until the user extracted again.
    """
    st.set_page_config(page_title="MMPI-2 데이터 추출기", page_icon="📄")

    st.title("🔍 MMPI-2 데이터 추출기")
    st.write("MMPI-2 PDF 보고서에서 데이터를 추출합니다.")

    uploaded_files = st.file_uploader(
        "PDF 파일들을 선택하세요",
        type=['pdf'],
        accept_multiple_files=True,
        help="여러 개의 MMPI-2 PDF 보고서를 동시에 업로드할 수 있습니다."
    )

    if not uploaded_files:
        return

    st.write(f"📁 선택된 파일: {len(uploaded_files)}개")

    for file in uploaded_files:
        st.write(f"• {file.name}")

    # Identifies the current upload set (name + byte length per file) so that
    # stored results are never shown for a different set of uploads.
    upload_signature = tuple(
        (file.name, len(file.getvalue())) for file in uploaded_files
    )
    state_key = "mmpi_extraction"

    if st.button("🚀 데이터 추출 시작", type="primary"):
        with st.spinner("데이터를 추출하는 중..."):
            extractor = MmpiExtractor()
            results = extractor.process_files(uploaded_files)
        st.session_state[state_key] = (upload_signature, results)

    stored = st.session_state.get(state_key)
    if stored is None or stored[0] != upload_signature:
        # Nothing extracted yet for the files currently selected.
        return
    results = stored[1]

    if not results:
        st.warning("⚠️ 처리된 파일이 없습니다. PDF 파일 형식을 확인해주세요.")
        return

    st.success(f"✅ {len(results)}개 파일 처리 완료!")

    # Download buttons covering all results at once.
    col1, col2 = st.columns(2)
    with col1:
        all_content = "\n\n".join(
            f"---{name}---\n{content}" for name, content in results.items()
        )
        st.download_button(
            "📋 전체 결과 다운로드",
            all_content,
            file_name="mmpi_results_all.txt",
            mime="text/plain"
        )
    with col2:
        ai_content = "\n\n".join(results.values())
        st.download_button(
            "🤖 AI용 데이터 다운로드",
            ai_content,
            file_name="mmpi_results_for_ai.txt",
            mime="text/plain"
        )

    # Per-report view: text preview plus an individual download button.
    for name, content in results.items():
        with st.expander(f"📄 {name}"):
            st.text_area(
                f"내용 ({name})",
                content,
                height=300,
                key=f"content_{name}"
            )
            st.download_button(
                f"💾 {name} 다운로드",
                content,
                file_name=f"mmpi_{name}.txt",
                mime="text/plain",
                key=f"download_{name}"
            )

# Invoke main() only when this module is run as the main script, not when it
# is imported.
if __name__ == "__main__":
    main()