d

2025-12-05 03:42:57 -08:00
parent 5e4df11dbf
commit 5482ee5586
20 changed files with 911 additions and 223 deletions
--- a/cmdlets/download_data.py
+++ b/cmdlets/download_data.py
@@ -41,7 +41,8 @@ from config import resolve_output_dir
 from metadata import (
    fetch_openlibrary_metadata_tags,
    format_playlist_entry,
-    extract_ytdlp_tags
+    extract_ytdlp_tags,
+    build_book_tags,
 )

 # ============================================================================
@@ -1499,12 +1500,19 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any], emit_results:
                        metadata = item.get('full_metadata', {}) if isinstance(item.get('full_metadata'), dict) else {}
                        mirrors = metadata.get('mirrors', {})
                        book_id = metadata.get('book_id', '')
+                        author = metadata.get('author')
+                        isbn_val = metadata.get('isbn')
+                        year_val = metadata.get('year')
                        
                        if url:
                            url_entry = {
                                'url': str(url),
                                'mirrors': mirrors,  # Alternative mirrors for fallback
                                'book_id': book_id,
+                                'title': title,
+                                'author': author,
+                                'isbn': isbn_val,
+                                'year': year_val,
                            }
                            urls_to_download.append(url_entry)
                            debug(f"[search-result] LibGen: '{title}'")
@@ -1700,12 +1708,19 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any], emit_results:
                        metadata = getattr(item, 'full_metadata', {}) if isinstance(getattr(item, 'full_metadata', None), dict) else {}
                        mirrors = metadata.get('mirrors', {})
                        book_id = metadata.get('book_id', '')
+                        author = metadata.get('author')
+                        isbn_val = metadata.get('isbn')
+                        year_val = metadata.get('year')
                        
                        if url:
                            url_entry = {
                                'url': str(url),
                                'mirrors': mirrors,  # Alternative mirrors for fallback
                                'book_id': book_id,
+                                'title': title,
+                                'author': author,
+                                'isbn': isbn_val,
+                                'year': year_val,
                            }
                            urls_to_download.append(url_entry)
                        else:
@@ -2177,6 +2192,10 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any], emit_results:
                    primary_url = url.get('url')
                    mirrors_dict = url.get('mirrors', {})
                    book_id = url.get('book_id', '')
+                    title_val = url.get('title')
+                    author_val = url.get('author')
+                    isbn_val = url.get('isbn')
+                    year_val = url.get('year')
                    
                    if not primary_url:
                        debug(f"Skipping libgen entry: no primary URL")
@@ -2219,39 +2238,82 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any], emit_results:
                            
                            # Use libgen_service's download_from_mirror for proper libgen handling
                            from helper.libgen_service import download_from_mirror
-                            
+                    
                            # Generate filename from book_id and title
                            safe_title = "".join(c for c in str(title or "book") if c.isalnum() or c in (' ', '.', '-'))[:100]
                            file_path = final_output_dir / f"{safe_title}_{book_id}.pdf"
-                            
+                    
+                            progress_bar = models.ProgressBar()
+                            progress_start = time.time()
+                            last_update = [progress_start]
+                            progress_bytes = [0]
+                            progress_total = [0]
+
+                            def _libgen_progress(downloaded: int, total: int) -> None:  # progress callback passed to download_from_mirror; records counts and logs a throttled progress line
+                                progress_bytes[0] = downloaded  # one-element lists let this closure update outer state without `nonlocal`
+                                progress_total[0] = total
+                                now = time.time()
+                                if total > 0 and now - last_update[0] >= 0.5:  # skip when total is not positive; log at most once per 0.5 s
+                                    percent = (downloaded / total) * 100
+                                    elapsed = max(now - progress_start, 1e-6)  # floor keeps the division below away from zero
+                                    speed = downloaded / elapsed if elapsed > 0 else 0  # average rate since progress_start (the floor above already makes elapsed > 0)
+                                    remaining = max(total - downloaded, 0)  # clamp in case downloaded exceeds total
+                                    eta = remaining / speed if speed > 0 else 0  # seconds left; falls back to 0 when speed is 0
+                                    minutes, seconds = divmod(int(eta), 60)
+                                    hours, minutes = divmod(minutes, 60)
+                                    eta_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"  # HH:MM:SS
+                                    speed_str = f"{progress_bar.format_bytes(speed)}/s"
+                                    progress_line = progress_bar.format_progress(
+                                        percent_str=f"{percent:.1f}%",
+                                        downloaded=downloaded,
+                                        total=total,
+                                        speed_str=speed_str,
+                                        eta_str=eta_str,
+                                    )
+                                    debug(f"    {progress_line}")
+                                    last_update[0] = now  # restart the throttle window
+
                            # Attempt download using libgen's native function
-                            success = download_from_mirror(
+                            success, downloaded_path = download_from_mirror(
                                mirror_url=mirror_url,
                                output_path=file_path,
                                log_info=lambda msg: debug(f"    {msg}"),
-                                log_error=lambda msg: debug(f"    ⚠ {msg}")
+                                log_error=lambda msg: debug(f"    ⚠ {msg}"),
+                                progress_callback=_libgen_progress,
                            )
-                            
-                            if success and file_path.exists():
+
+                            final_path = Path(downloaded_path) if downloaded_path else file_path
+                            if success and final_path.exists():
+                                downloaded = progress_bytes[0] or final_path.stat().st_size
+                                elapsed = time.time() - progress_start
+                                avg_speed = downloaded / elapsed if elapsed > 0 else 0
+                                debug(f"    ✓ Downloaded in {elapsed:.1f}s at {progress_bar.format_bytes(avg_speed)}/s")
                                debug(f"  ✓ Downloaded successfully from mirror #{mirror_idx}")
                                successful_mirror = mirror_url
                                download_succeeded = True
-                                
+                        
                                # Emit result for downstream cmdlets
-                                file_hash = _compute_file_hash(file_path)
-                                emit_tags = ['libgen', 'book']
-                                
+                                file_hash = _compute_file_hash(final_path)
+                                emit_tags = build_book_tags(
+                                    title=title_val or title,
+                                    author=author_val,
+                                    isbn=isbn_val,
+                                    year=year_val,
+                                    source='libgen',
+                                    extra=[f"libgen_id:{book_id}"] if book_id else None,
+                                )
+
                                pipe_obj = create_pipe_object_result(
                                    source='libgen',
                                    identifier=book_id,
-                                    file_path=str(file_path),
+                                    file_path=str(final_path),
                                    cmdlet_name='download-data',
                                    file_hash=file_hash,
                                    tags=emit_tags,
                                    source_url=successful_mirror
                                )
                                pipeline_context.emit(pipe_obj)
-                                downloaded_files.append(str(file_path))
+                                downloaded_files.append(str(final_path))
                                exit_code = 0
                                break  # Success, stop trying mirrors
                        
@@ -2643,38 +2705,61 @@ def _run(result: Any, args: Sequence[str], config: Dict[str, Any], emit_results:
                    
                    # Let's try to get metadata to make a good filename
                    filename = "libgen_download.bin"
+                    title_from_results = None
+                    author_from_results = None
+                    year_from_results = None
                    if libgen_id and results:
-                        title = results[0].get("title", "book")
+                        title_from_results = results[0].get("title")
+                        author_from_results = results[0].get("author")
+                        year_from_results = results[0].get("year")
                        ext = results[0].get("extension", "pdf")
                        # Sanitize filename
-                        safe_title = "".join(c for c in title if c.isalnum() or c in (' ', '-', '_')).strip()
+                        safe_title = "".join(c for c in (title_from_results or "book") if c.isalnum() or c in (' ', '-', '_')).strip()
                        filename = f"{safe_title}.{ext}"
                    elif "series.php" in url:
                         filename = f"series_{re.search(r'id=(\d+)', url).group(1) if re.search(r'id=(\d+)', url) else 'unknown'}.pdf"
                    
                    output_path = final_output_dir / filename
                    
-                    if download_from_mirror(url, output_path, log_info=debug, log_error=log):
-                        debug(f"✓ LibGen download successful: {output_path}")
-                        
+                    success, downloaded_path = download_from_mirror(
+                        url,
+                        output_path,
+                        log_info=debug,
+                        log_error=log,
+                    )
+                    final_file = Path(downloaded_path) if downloaded_path else output_path
+                    if success and final_file.exists():
+                        debug(f"✓ LibGen download successful: {final_file}")
+                    
                        # Create a result object
                        info = {
                            "id": libgen_id or "libgen",
                            "title": filename,
                            "webpage_url": url,
-                            "ext": output_path.suffix.lstrip("."),
+                            "ext": final_file.suffix.lstrip("."),
                        }
-                        
+
+                        emit_tags = build_book_tags(
+                            title=title_from_results or filename,
+                            author=author_from_results,
+                            year=year_from_results,
+                            source="libgen",
+                            extra=[f"libgen_id:{libgen_id}"] if libgen_id else None,
+                        )
+                        file_hash = _compute_file_hash(final_file)
+
                        # Emit result
                        pipeline_context.emit(create_pipe_object_result(
                            source="libgen",
                            identifier=libgen_id or "libgen",
-                            file_path=str(output_path),
+                            file_path=str(final_file),
                            cmdlet_name="download-data",
                            title=filename,
+                            file_hash=file_hash,
+                            tags=emit_tags,
                            extra=info
                        ))
-                        downloaded_files.append(str(output_path))
+                        downloaded_files.append(str(final_file))
                        continue
                    else:
                        debug("⚠ LibGen specialized download failed, falling back to generic downloader...")