Bước 2*. Tải về từng liên kết trong danh sách sử dụng Windows

Thường cần dùng đến nếu site có sử dụng CloudFlare để chống bot (mình là bot mà). Do đó cần:

Trình duyệt thật
Ưu tiên windows
Có thể cần vượt qua captcha được (cần trình duyệt mở lên thật để thao tác)
Có một lợi thế quan trọng là không cần phải truyền cookies vào, do mình dùng trình duyệt thật. Vì vậy chỉ cần giãn cách thời gian crawl hợp lý thì gần như không có cách nào chúng ta bị chặn.

Dưới đây là hướng dẫn đầy đủ, từ cài đặt môi trường đến script ví dụ.

Phần A. Cài đặt môi trường

Mở PowerShell (Run as Administrator nếu cần) và làm theo từng lệnh.

Cài Python (nếu chưa có)

Download Python 3.10+ từ python.org và chọn “Add Python to PATH” khi cài.
(Nếu đã có python / py, kiểm tra phiên bản bằng lệnh: python --version hoặc py --version)

Tạo virtual environment (ví dụ folder project C:\projects\tvpl):

cd C:\projects\tvpl
python -m venv .venv
.\.venv\Scripts\Activate.ps1

Cài thư viện cần thiết (các gói mà script bên dưới import):

pip install undetected-chromedriver selenium requests pandas

Kiểm tra đường dẫn Chrome (mặc định Windows):

Thông thường Chrome nằm ở:
- C:\Program Files\Google\Chrome\Application\chrome.exe (64-bit)
- hoặc C:\Users\<YourUser>\AppData\Local\Google\Chrome\Application\chrome.exe
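Bạn có thể kiểm tra bằng PowerShell (trả về True nếu file tồn tại):

Test-Path "C:\Program Files\Google\Chrome\Application\chrome.exe"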

Quyết định profile Chrome (dùng profile thật)

Mặc định profile Chrome nằm ở:
%LOCALAPPDATA%\Google\Chrome\User Data
Ví dụ (đây là thư mục User Data; profile Default nằm trong thư mục con Default của nó):
C:\Users\<YourUser>\AppData\Local\Google\Chrome\User Data
Lưu ý: nếu profile đang được dùng (Chrome đang chạy), tốt nhất hãy đóng Chrome trước khi chạy script để tránh bị khóa (lock) profile. undetected-chromedriver (UC) đôi khi vẫn dùng lại được profile đang mở nếu không crash, nhưng an toàn nhất là: lần đầu mở Chrome thủ công, vượt Cloudflare (CF), đóng Chrome, rồi mới chạy script.

Thiết lập biến môi trường (tuỳ chọn) — hoặc chỉnh trực tiếp trong file Python:

# set PROFILE_DIR to your Chrome user-data (optional; else code uses default)

$env:PROFILE_DIR = "C:\Users\<YourUser>\AppData\Local\Google\Chrome\User Data"


# bật headful lần đầu để bạn có thể bấm qua CF:

$env:HEADLESS = "0"


# chỉ định Chrome binary (nếu cần)

$env:CHROME_BIN = "C:\Program Files\Google\Chrome\Application\chrome.exe"

7. Chạy script thôi

cd C:\projects\tvpl

python 3LayFiles_Windows.py

B. SCRIPT THAM KHẢO

Script này đã điều chỉnh để sử dụng cho Windows. Vì chạy bằng python, nên về cơ bản không thay đổi gì cả. Chỉ thay đổi cài đặt trình duyệt thôi.

Khi run sẽ mở trình duyệt thật. Bạn có thể đăng nhập tài khoản trang web 1 lần duy nhất. Và cứ thế sẽ chạy!!!

Khỏe hơn bên Linux nhiều.

Lưu ý: Một số site chỉ an toàn khi chạy 1 luồng crawl. Nếu cần chạy nhiều luồng crawl hơn thì phải thay đổi IP, tức là cần dùng proxy. Bạn có thể tham khảo cách sử dụng proxy trong bài tiếp theo.

# 3LayFiles_Windows.py
# -*- coding: utf-8 -*-
import os
import re
import csv
import time
import json
import random
import mimetypes
import requests
import pandas as pd
from urllib.parse import urljoin, unquote
from datetime import datetime

# Selenium + UC
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
TimeoutException,
WebDriverException,
NoSuchWindowException,
)

# ============== CONFIGURATION (adjust if needed) ==============
# Each os.getenv() value can be overridden through the environment variable
# of the same name; the literal is the fallback used when it is unset.
LOG_FILE = os.getenv("LOG_FILE", "3LayFiles_Windows.log")
OUTPUT_CSV = os.getenv("OUTPUT_CSV", "3LayFiles_Windows-Output.csv")
COOKIE_FILE = os.getenv("COOKIE_FILE", "3LayFiles.json")  # optional
INPUT_CSV = os.getenv("INPUT_CSV", "3LayFiles_Windows-Input.csv")
BASE_URL = "https://WebSiteNguonLayData.vn"
SAVE_ROOT = os.getenv("SAVE_ROOT", "files_Windows")

# HEADLESS is read from the environment and defaults to "1" (headless).
# Only "0", "false" or "no" (case-insensitive) turn it off. For the first run
# set HEADLESS=0 so the Cloudflare check can be passed by hand.
HEADLESS = os.getenv("HEADLESS", "1").lower() not in ("0", "false", "no")

# PROFILE_DIR: Chrome user-data directory dedicated to the crawler (its lock
# files are NOT deleted by this script). Falls back to
# %LOCALAPPDATA%\Google\Chrome\User Data TVPL.
PROFILE_DIR = os.getenv("PROFILE_DIR") or os.path.join(
    os.getenv("LOCALAPPDATA", r"C:\Users\Default\AppData\Local"),
    "Google",
    "Chrome",
    "User Data TVPL",
)

# CHROME_BIN: path to chrome.exe.
CHROME_BIN = os.getenv(
    "CHROME_BIN",
    r"C:\Program Files\Google\Chrome\Application\chrome.exe",
)

# When Cloudflare blocks a headless run, ask the user to re-run headful.
FORCE_HEADFUL_ON_CF = True

# Real User-Agent, filled in from the browser once the driver has started.
CURRENT_UA = None

# ============== TIỆN ÍCH ==============
def write_log(msg: str):
    """Append a timestamped line to LOG_FILE and echo the same line to stdout.

    Args:
        msg: Message text; it is prefixed with ``[YYYY-MM-DD HH:MM:SS]``.
    """
    stamp = datetime.now().strftime("[%Y-%m-%d %H:%M:%S]")
    entry = f"{stamp} {msg}"
    with open(LOG_FILE, "a", encoding="utf-8") as log:
        log.write(f"{entry}\n")
    print(entry)

def to01(x):
    """Normalise a "done" flag read from the input CSV to the integer 0 or 1.

    Returns 1 when ``x`` is numerically equal to one (``1``, ``"1"``, ``1.0``,
    ``"1.0"``) or is one of the words ``true`` / ``yes`` / ``y`` (any case,
    surrounding whitespace ignored). Everything else, including empty strings,
    ``None`` and NaN, yields 0.

    Float-formatted values are accepted on purpose: a flag of 1 that arrives
    as ``1.0`` must still count as done, otherwise the row is crawled again.

    Args:
        x: Raw cell value of any type.

    Returns:
        ``1`` or ``0``.
    """
    text = str(x).strip().lower()
    if text in ("true", "yes", "y"):
        return 1
    try:
        # float() also parses "1.0"; NaN and infinities compare unequal to 1.
        return 1 if float(text or "0") == 1 else 0
    except ValueError:
        return 0

def load_cookies_from_json(path: str):
    """Load an optional list of cookie dicts from a JSON file.

    Args:
        path: Location of the JSON cookie export.

    Returns:
        The list of cookie dicts found in the file. An empty list is returned
        when the file is missing, unreadable, not valid JSON, or does not
        contain a JSON array. Array entries that are not objects are dropped.

    A warning is logged when no ``cf_clearance`` cookie is present, since the
    file then cannot help with passing Cloudflare.
    """
    if not os.path.exists(path):
        return []
    try:
        with open(path, "r", encoding="utf-8") as f:
            raw = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        write_log(f"⚠️ Không đọc được COOKIE_FILE: {e}")
        return []

    # Guard against exports that are not a plain array of cookie objects;
    # calling .get() on such data would otherwise crash the whole run.
    if not isinstance(raw, list):
        write_log("⚠️ COOKIE_FILE không phải là danh sách cookie — bỏ qua.")
        return []
    cookies = [c for c in raw if isinstance(c, dict)]

    names = {c.get("name") for c in cookies}
    if "cf_clearance" not in names:
        write_log("⚠️ COOKIE_FILE không có cf_clearance — chỉ dùng như cookie phụ, không dùng để vượt CF.")
    return cookies

def guess_extension(href: str, content_type: str):
    """Pick a file extension from the response Content-Type, else from the URL.

    Args:
        href: Download link (may be percent-encoded, may be ``None``).
        content_type: Raw ``Content-Type`` header value (may be ``None``).

    Returns:
        The extension suggested by ``mimetypes`` for the media type, unless
        that is missing or the generic ``.bin``; otherwise ``.pdf``, ``.docx``
        or ``.doc`` when that text occurs in the decoded link; otherwise
        ``.bin``.
    """
    # Drop header parameters such as "; charset=utf-8": mimetypes looks the
    # value up verbatim and would not recognise "application/pdf; charset=x".
    mime = (content_type or "").split(";", 1)[0].strip().lower()
    ext = mimetypes.guess_extension(mime) if mime else None
    if ext and ext != ".bin":
        return ext

    # Decode first, then lower-case, so percent-encoded upper-case letters
    # (e.g. "%50DF" -> "PDF") are matched as well.
    decoded = unquote(href or "").lower()
    for known in (".pdf", ".docx", ".doc"):  # ".docx" must be tested before ".doc"
        if known in decoded:
            return known
    return ".bin"

def clean_filename(text: str):
    """Turn arbitrary link text into a short name usable as a file name.

    Characters other than word characters, whitespace, ``-`` and ``.`` are
    removed, the result is trimmed, each whitespace run becomes one ``_``,
    and the name is cut to 120 characters. An empty result becomes ``file``.

    Args:
        text: Source text; ``None`` is treated as empty.

    Returns:
        The cleaned name, at most 120 characters long.
    """
    kept = re.sub(r"[^\w\s\-\.]", "", text or "")
    joined = re.sub(r"\s+", "_", kept.strip())
    if not joined:
        joined = "file"
    return joined[:120]

def unique_path(path: str):
    """Return ``path`` if it is free, else the first free ``<base>_<n><ext>``.

    Args:
        path: Desired file path.

    Returns:
        ``path`` itself when nothing exists there; otherwise the same path
        with ``_1``, ``_2``, ... inserted before the extension, using the
        lowest number whose path does not exist yet.
    """
    if not os.path.exists(path):
        return path
    stem, suffix = os.path.splitext(path)
    n = 1
    candidate = f"{stem}_{n}{suffix}"
    while os.path.exists(candidate):
        n += 1
        candidate = f"{stem}_{n}{suffix}"
    return candidate

# ============== DRIVER (UC) — KHÔNG xoá khóa profile, giữ nguyên profile lock ==============
def make_driver(headless=None, max_attempts=3):
    """Start Chrome through undetected-chromedriver (UC), retrying on failure.

    The profile's Singleton*/LOCK files are never deleted, so an existing
    profile lock is left untouched.

    Args:
        headless: Run Chrome headless when true. ``None`` falls back to the
            module-level ``HEADLESS`` setting. If the first attempt fails
            while headless, the remaining attempts run headful.
        max_attempts: Maximum number of launch attempts.

    Returns:
        The started ``uc.Chrome`` driver.

    Raises:
        RuntimeError: When every attempt failed; the last underlying error is
            chained as the cause.

    Side effects:
        Stores the browser's reported User-Agent in the global ``CURRENT_UA``
        when it can be read, and creates ``PROFILE_DIR`` if it is missing.
    """
    global CURRENT_UA
    if headless is None:
        headless = HEADLESS

    write_log(f"🔧 Sử dụng Chrome profile: {PROFILE_DIR}")
    write_log(f"🧭 Dùng Chrome binary: {CHROME_BIN}")
    write_log(f"📁 PROFILE_DIR exists={os.path.exists(PROFILE_DIR)}")

    # Switches passed to Chrome on every attempt.
    common_flags = (
        # stabilising flags
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--disable-features=TranslateUI,AutomationControlled",
        "--no-first-run",
        "--no-default-browser-check",
        "--disable-extensions",
        "--disable-software-rasterizer",
        "--disable-background-networking",
        "--disable-sync",
        # reduce the automation fingerprint
        "--disable-blink-features=AutomationControlled",
        # preferred languages
        "--lang=vi-VN,vi;q=0.9,en-US;q=0.8,en;q=0.7",
    )

    # Script injected into every new document to mask common automation tells.
    stealth_js = (
        "\n"
        "Object.defineProperty(navigator, 'webdriver', {get: () => undefined});\n"
        "window.chrome = window.chrome || { runtime: {} };\n"
        "Object.defineProperty(navigator, 'plugins', {get: () => [1,2,3,4,5]});\n"
        "Object.defineProperty(navigator, 'languages', {get: () => ['vi-VN','vi','en-US','en']});\n"
    )

    last_err = None
    for attempt in range(1, max_attempts + 1):
        try:
            # A new options object is built for each attempt.
            opts = uc.ChromeOptions()
            if headless:
                opts.add_argument("--headless=new")
            for flag in common_flags:
                opts.add_argument(flag)

            # Attach the profile; its lock files are deliberately left alone.
            os.makedirs(PROFILE_DIR, exist_ok=True)
            opts.add_argument(f"--user-data-dir={PROFILE_DIR}")
            opts.add_argument("--profile-directory=Default")

            # Fixed DevTools port.
            opts.add_argument("--remote-debugging-port=9222")

            if CHROME_BIN and os.path.exists(CHROME_BIN):
                opts.binary_location = CHROME_BIN
            else:
                write_log("⚠️ CHROME_BIN không tồn tại; UC sẽ tự tìm Chrome.")

            write_log(f"🚀 Khởi tạo UC Chrome (attempt {attempt}/{max_attempts}) | headless={headless}")
            driver = uc.Chrome(options=opts, use_subprocess=True)

            # Stealth patches are best-effort: a failure is logged, not fatal.
            try:
                driver.execute_cdp_cmd(
                    "Page.addScriptToEvaluateOnNewDocument",
                    {"source": stealth_js},
                )
            except Exception as e:
                write_log(f"⚠️ Không áp dụng được stealth patch: {e}")

            # Remember the real User-Agent for the download session.
            try:
                ua = driver.execute_script("return navigator.userAgent")
                CURRENT_UA = ua
                write_log(f"🧾 User-Agent thực: {ua}")
            except Exception:
                write_log("⚠️ Không lấy được User-Agent từ driver")

            caps = driver.capabilities
            write_log(f"✅ UC WebDriver OK: {caps.get('browserName')} {caps.get('browserVersion')}, headless={headless}")
            return driver

        except Exception as e:
            last_err = e
            write_log(f"⏳ Attempt {attempt} thất bại: {e}")
            if attempt == max_attempts:
                break  # nothing left to retry, so do not wait
            time.sleep(2.5)
            if headless and attempt == 1:
                write_log("🔁 Chuyển tạm sang headful để khởi động Chrome cho chắc…")
                headless = False

    raise RuntimeError(
        f"Không khởi tạo được UC Chrome sau {max_attempts} lần: {last_err}"
    ) from last_err

# ============== CLOUDFLARE ==============
def wait_cloudflare_clearance(driver, max_wait=60, require_cookie=False):
    """Poll until the page no longer shows a Cloudflare interstitial title.

    The page counts as cleared once its title contains neither
    "just a moment" nor "checking your browser" (case-insensitive) and, when
    ``require_cookie`` is true, a ``cf_clearance`` cookie is present. That
    state must hold with an unchanged title for about one second before
    success is reported.

    Args:
        driver: Selenium driver; its ``title`` and ``get_cookies()`` are read.
        max_wait: Maximum number of seconds to keep polling.
        require_cookie: Also demand the ``cf_clearance`` cookie.

    Returns:
        ``True`` once the cleared state is stable, ``False`` on timeout.
    """
    cf_markers = ("just a moment", "checking your browser")
    poll_interval = 0.5
    stable_needed = 1.0

    # monotonic() is immune to wall-clock adjustments during the wait.
    deadline = time.monotonic() + max_wait
    stable_since = None  # when the current cleared + unchanged streak began
    last_title = None    # None: no title observed yet

    while time.monotonic() < deadline:
        title = (driver.title or "").strip()
        lowered = title.lower()
        has_clearance = any(c.get("name") == "cf_clearance" for c in driver.get_cookies())
        not_cf_title = not any(marker in lowered for marker in cf_markers)
        ok = not_cf_title and (has_clearance or not require_cookie)
        write_log(f"🔍 CF check | title='{title[:60]}' | cf_clearance={has_clearance}")

        now = time.monotonic()
        if not ok:
            # A failed poll ends any streak in progress.
            stable_since = None
        elif stable_since is None or title != last_title:
            # First cleared poll, or the title changed: start a new streak.
            stable_since = now
        elif now - stable_since >= stable_needed:
            return True

        last_title = title
        time.sleep(poll_interval)
    return False

# ============== MỞ TAB / TÌM LINK ==============
def open_download_tab_or_fallback(driver, uuid, timeout_tab=12):
    """Open the "Tải về" (download) tab and collect its download links.

    Several selectors for the tab are tried in turn. After a click, links
    inside ``#tab8`` whose href contains ``download.aspx`` are returned. If
    no selector yields links, the whole page is searched instead, first with
    CSS and then with a case-insensitive XPath.

    Args:
        driver: Selenium driver positioned on the document page.
        uuid: Row identifier, used only in log messages.
        timeout_tab: Seconds to wait for ``#tab8`` to appear after the click.

    Returns:
        ``(links, used_fallback)``: the list of link elements and whether it
        came from the page-wide fallback. ``([], False)`` when nothing is
        found.
    """
    link_css = 'a[href*="download.aspx"]'
    tab_selectors = [
        (By.ID, "aTabTaiVe"),
        (By.CSS_SELECTOR, 'a[href="#tab8"]'),
        (By.CSS_SELECTOR, 'a[data-target="#tab8"]'),
        (By.XPATH, "//a[contains(normalize-space(.),'Tải về') or contains(@title,'Tải về')]"),
        (By.XPATH, "//li[a[contains(normalize-space(.),'Tải về')]]/a"),
    ]
    for by, sel in tab_selectors:
        try:
            el = WebDriverWait(driver, 6).until(EC.element_to_be_clickable((by, sel)))
            driver.execute_script("arguments[0].scrollIntoView({block:'center'});", el)
            time.sleep(0.2)
            # Click through JavaScript rather than el.click().
            driver.execute_script("arguments[0].click();", el)
            try:
                WebDriverWait(driver, timeout_tab).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '#tab8'))
                )
            except TimeoutException:
                pass  # still look for links below; the pane may already exist
            links = driver.find_elements(By.CSS_SELECTOR, f"#tab8 {link_css}")
            if links:
                return links, False
        except Exception:
            # Best effort: this selector did not work, try the next one.
            continue

    links_all = driver.find_elements(By.CSS_SELECTOR, link_css)
    if links_all:
        write_log(f"🔎 {uuid}: Fallback CSS toàn trang: {len(links_all)} link")
        return links_all, True

    # Case-insensitive match: translate() lower-cases every letter that occurs
    # in "download.aspx" (A D L N O P S W X), so "DOWNLOAD.ASPX" matches too.
    links_all2 = driver.find_elements(
        By.XPATH,
        "//a[contains(translate(@href,'ADLNOPSWX','adlnopswx'),'download.aspx')]",
    )
    if links_all2:
        write_log(f"🔎 {uuid}: Fallback XPath toàn trang: {len(links_all2)} link")
        return links_all2, True

    return [], False

# ============== TẢI FILE ==============
def classify_and_save_file(href, link_text, session, uuid, word_counter):
    """Download one link and store it under SAVE_ROOT/<uuid>/<pdf|doc|other>/.

    The file name comes from the Content-Disposition header when present.
    Otherwise it is built from the link text plus a guessed extension, except
    for links recognised as the Word rendition ("vietnamesehyperlink" in the
    href, or a ``.doc`` with ``part=-1``), which are named
    ``w<uuid>_<counter>``.

    Args:
        href: Link target, absolute or relative to BASE_URL.
        link_text: Visible text of the link (fallback file name).
        session: ``requests.Session`` carrying the browser's UA and cookies.
        uuid: Row identifier; used for the folder name and in log messages.
        word_counter: Number of Word renditions named so far for this row.

    Returns:
        ``(ok, word_counter)``: whether the file was saved, and the possibly
        incremented counter. Errors are logged, never raised.
    """
    full_url = urljoin(BASE_URL, href)
    headers = {"User-Agent": session.headers.get("User-Agent", ""), "Referer": BASE_URL}
    partial = None  # path of a file currently being written, for cleanup
    try:
        # The context manager releases the streamed connection on every exit
        # path, including the early return and exceptions.
        with session.get(full_url, headers=headers, timeout=120, stream=True) as resp:
            if resp.status_code != 200:
                write_log(f"❌ {uuid}: Lỗi tải {full_url} (HTTP {resp.status_code})")
                return False, word_counter

            cd = resp.headers.get("Content-Disposition", "") or ""
            filename = None
            m = re.search(r'filename="?([^";]+)"?', cd)
            if m:
                # The header is server-controlled: keep only the last path
                # component and drop characters that are illegal in Windows
                # file names, so the file cannot land outside its folder.
                leaf = m.group(1).replace("\\", "/").rsplit("/", 1)[-1]
                leaf = re.sub(r'[<>:"|?*\x00-\x1f]', "", leaf).strip(" .")
                filename = leaf or None

            if not filename:
                ext = guess_extension(href, resp.headers.get("Content-Type"))
                href_lower = (href or "").lower()
                if "vietnamesehyperlink" in href_lower or (ext == ".doc" and "part=-1" in href_lower):
                    word_counter += 1
                    filename = f"w{uuid}_{word_counter}{ext}"
                else:
                    filename = clean_filename(link_text) + ext

            ext = os.path.splitext(filename)[-1].lower()
            subfolder = "pdf" if ext == ".pdf" else ("doc" if ext in [".doc", ".docx"] else "other")
            folder = os.path.join(SAVE_ROOT, str(uuid), subfolder)
            os.makedirs(folder, exist_ok=True)
            path = unique_path(os.path.join(folder, filename))

            partial = path
            with open(path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            partial = None  # fully written

        write_log(f"✅ {uuid}: Đã tải {path}")
        return True, word_counter
    except Exception as e:
        write_log(f"❌ {uuid}: Exception khi tải {full_url} - {e}")
        # Do not leave a truncated file behind that looks like a real download.
        if partial and os.path.exists(partial):
            try:
                os.remove(partial)
            except OSError as cleanup_err:
                write_log(f"⚠️ {uuid}: Không xoá được file tải dở {partial} - {cleanup_err}")
        return False, word_counter

# ============== LUỒNG 1 DÒNG (dùng driver đã tạo sẵn) ==============
def ensure_clearance(driver, url, max_wait=60):
    """Load ``url`` in the browser and wait for the Cloudflare check to pass.

    BASE_URL is opened first as a best-effort warm-up; a failure there is
    logged and ignored. The target page is then loaded and polled through
    ``wait_cloudflare_clearance``.

    Args:
        driver: Selenium driver to navigate.
        url: Page to open.
        max_wait: Seconds to wait for the Cloudflare check.

    Returns:
        ``True`` when the page is cleared, ``False`` on timeout.

    Raises:
        Whatever ``driver.get(url)`` or the 20-second wait for ``<body>`` on
        the target page raises; only the warm-up step is guarded.
    """
    try:
        driver.get(BASE_URL)
        WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    except Exception as e:
        # Warm-up only: record the problem and carry on to the real page.
        write_log(f"⚠️ Không mở được trang chủ {BASE_URL} (bỏ qua): {e}")
    driver.get(url)
    WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    return wait_cloudflare_clearance(driver, max_wait=max_wait)

def download_files_for_row(uuid, url, cookies, driver):
    """Download every file linked from one document page.

    Uses the already-open driver (no driver is created or quit per link) and
    never deletes profile lock files.

    Args:
        uuid: Row identifier; used for the folder name and in log messages.
        url: Document page to open.
        cookies: Cookies loaded from COOKIE_FILE. Kept for interface
            compatibility; this function does not read it, because the
            session cookies are taken from the live browser.
        driver: Selenium driver shared by the whole batch.

    Returns:
        ``False`` when Cloudflare blocks the page, no download link is found,
        three downloads in a row fail, or an exception occurs; ``True``
        otherwise.
    """
    error_count = 0  # consecutive failed downloads
    word_counter = 0
    success = True
    try:
        # Make sure this URL is past Cloudflare.
        if not ensure_clearance(driver, url, max_wait=120):
            write_log(f"❌ {uuid}: CF chặn (headless={HEADLESS}).")
            if FORCE_HEADFUL_ON_CF and HEADLESS:
                write_log("🔁 Yêu cầu chạy headful để xác thực (đặt HEADLESS=0) rồi chạy lại.")
            return False
        write_log(f"✅ {uuid}: Qua Cloudflare.")

        # Collect the download links.
        links, used_fallback = open_download_tab_or_fallback(driver, uuid)
        if not links:
            write_log(f"❌ {uuid}: Không tìm thấy tab/link Tải về (kể cả fallback)")
            return False
        source = "fallback" if used_fallback else "tab8"
        write_log(f"🔗 {uuid}: Tìm thấy {len(links)} link (nguồn: {source})")

        # Read href/text from every element now. The loop below sleeps for
        # seconds between files, and an element handle can go stale if the
        # page changes in the meantime.
        targets = [
            (link.get_attribute("href"), (link.text or "").strip())
            for link in links
        ]

        # Build an HTTP session that mirrors the browser's UA and cookies.
        session = requests.Session()
        real_ua = CURRENT_UA or "Mozilla/5.0"
        session.headers.update({
            "User-Agent": real_ua,
            "Referer": BASE_URL,
            "Accept-Language": "vi-VN,vi;q=0.9,en-US;q=0.8,en;q=0.7",
        })
        for ck in driver.get_cookies():
            try:
                # Reuse the domain/path the browser stored for the cookie, so
                # it is sent to the site that set it without a hard-coded
                # domain that has to be kept in sync with BASE_URL.
                session.cookies.set(
                    ck["name"],
                    ck["value"],
                    domain=ck.get("domain") or "",
                    path=ck.get("path") or "/",
                )
            except Exception as e:
                write_log(f"⚠️ {uuid}: Bỏ qua cookie không hợp lệ - {e}")

        # Download link by link.
        for href, text in targets:
            if not href:
                continue  # nothing to download for a link without a target
            ok, word_counter = classify_and_save_file(href, text, session, uuid, word_counter)
            if ok:
                error_count = 0  # the limit is on failures in a row
            else:
                error_count += 1
                if error_count >= 3:
                    write_log(f"⚠️ {uuid}: Lỗi liên tiếp 3 file – bỏ dòng này")
                    success = False
                    break
            time.sleep(random.uniform(5, 12))

    except (NoSuchWindowException, WebDriverException, TimeoutException) as e:
        write_log(f"❌ {uuid}: Lỗi WebDriver - {e}")
        success = False
    except Exception as e:
        write_log(f"❌ {uuid}: Lỗi không xác định - {e}")
        success = False

    return success

# ============== LUỒNG CHÍNH ==============
def run():
    """Process INPUT_CSV row by row and write the results to OUTPUT_CSV.

    Rows whose ``TaiVe`` flag is 0 are downloaded through one shared Chrome
    driver and the flag is set to 1 on success; rows already flagged 1 are
    copied unchanged. Each row is appended to OUTPUT_CSV as soon as it is
    handled, so an interrupted run keeps the rows finished so far.

    Raises:
        FileNotFoundError: INPUT_CSV does not exist.
        RuntimeError: A required column (uuid, url, TaiVe) is missing, or the
            driver could not be started.
    """
    # Log the resolved configuration so the wrong files are easy to spot.
    write_log(f"CONFIG -> INPUT_CSV={os.path.abspath(INPUT_CSV)} | OUTPUT_CSV={os.path.abspath(OUTPUT_CSV)} | LOG_FILE={os.path.abspath(LOG_FILE)} | SAVE_ROOT={os.path.abspath(SAVE_ROOT)}")

    if not os.path.exists(INPUT_CSV):
        write_log(f"⛔ Không tìm thấy INPUT_CSV: {INPUT_CSV}")
        raise FileNotFoundError(INPUT_CSV)

    # utf-8-sig accepts files with or without a BOM (Excel writes one, which
    # would otherwise end up inside the first column name).
    # dtype=str + keep_default_na=False keep every cell exactly as written:
    # no leading zeros lost from uuid, and empty cells stay "" rather than NaN.
    df = pd.read_csv(INPUT_CSV, encoding="utf-8-sig", dtype=str, keep_default_na=False)
    df = df.loc[:, ~df.columns.str.contains('^Unnamed', case=False, na=False)]
    for col in ["uuid", "url", "TaiVe"]:
        if col not in df.columns:
            raise RuntimeError("Thiếu cột bắt buộc: uuid/url/TaiVe")
    df["uuid"] = df["uuid"].astype(str).str.strip()
    df["url"] = df["url"].astype(str).str.strip()
    df["TaiVe"] = df["TaiVe"].map(to01).fillna(0).astype(int)

    cookies = load_cookies_from_json(COOKIE_FILE)
    pending = int((df["TaiVe"] == 0).sum())
    write_log(f"📊 Tổng dòng: {len(df)} | Sẽ xử lý TaiVe==0: {pending}")

    os.makedirs(SAVE_ROOT, exist_ok=True)

    # One driver for the whole batch, kept open until everything is done.
    driver = make_driver(headless=HEADLESS)
    try:
        # Opened inside the try block so Chrome is still shut down if the
        # output file cannot be created (e.g. it is open in another program).
        fieldnames = list(df.columns)
        with open(OUTPUT_CSV, "w", newline='', encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()

        for _, row in df.iterrows():
            uuid = str(row["uuid"]).strip()
            url = str(row["url"]).strip()

            if row["TaiVe"] == 0:
                pending -= 1
                if not url:
                    write_log(f"⚠️ {uuid}: Thiếu url – bỏ qua, giữ TaiVe=0")
                else:
                    write_log(f"🚀 Bắt đầu xử lý {uuid}")
                    ok = download_files_for_row(uuid, url, cookies, driver=driver)
                    row["TaiVe"] = 1 if ok else 0
                    write_log(f"📝 Cập nhật {uuid}: TaiVe={row['TaiVe']}")
                    # Pause between documents, but not after the last one.
                    if pending > 0:
                        sleep_time = random.uniform(60, 150)
                        write_log(f"🕒 Nghỉ {sleep_time:.2f} giây...")
                        time.sleep(sleep_time)
            else:
                write_log(f"⏭️ Bỏ qua {uuid} (TaiVe=1) – ghi nguyên trạng")

            # Append immediately so progress survives a crash.
            with open(OUTPUT_CSV, "a", newline='', encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=fieldnames)
                writer.writerow({k: row.get(k, "") for k in fieldnames})
    finally:
        # Close Chrome only when the WHOLE batch has finished.
        try:
            driver.quit()
        except Exception:
            pass

if __name__ == "__main__":
    # Script entry point: make sure the download root exists, then run the batch.
    os.makedirs(SAVE_ROOT, exist_ok=True)
    run()

Đăng nhập để gửi ý kiến

Bước 2*. Tải về từng liên kết trong danh sách sử dụng Windows

Phần A. Cài đặt môi trường

B. SCRIPT THAM KHẢO

Mac

Bài nổi bật

Crawler, Spider, Scraper

Câu hỏi, thảo luận