avaapm · Chirping-owl · Nov 26, 2024
diff --git a/MARVEL_Download.py b/MARVEL_Download.py
@@ -1,79 +1,99 @@
+from urllib.request import Request, urlopen
+from datetime import datetime
 from bs4 import BeautifulSoup
-from urllib2 import urlopen
 from PIL import Image
 import traceback
 import threading
 import datetime
 import logging
 import codecs
 import math
-import sys
 import os
 
 ##Uncomment the related dat file ('VesselClassification.dat' for Vessel Classification, 'IMOTrainAndTest.dat' for Vessel Verification/Retrieval/Recognition tasks.)
-FILE_TO_DOWNLOAD_FROM = "VesselClassification.dat"
-##FILE_TO_DOWNLOAD_FROM = "IMOTrainAndTest.dat" 
+FILE_TO_DOWNLOAD_FROM = "IMOTrainAndTest.dat"
+##FILE_TO_DOWNLOAD_FROM = "VesselClassification.dat"
 
-NUMBER_OF_WORKERS = 10
+NUMBER_OF_WORKERS = 50
 MAX_NUM_OF_FILES_IN_FOLDER = 5000
 IMAGE_HEIGHT = 256
 IMAGE_WIDTH = 256
-ORIGINAL_SIZE = 0 # 1 for yes, 0 for no
-JUST_IMAGE = 1 # 1 for yes, 0 for no
-
-
-photoDetails = ["Photographer:","Title:","Captured:","IMO:","Photo Category:","Description:"]
-vesselIdentification = ["Name:","IMO:","Flag:","MMSI:","Callsign:"]
-technicalData = ["Vessel type:","Gross tonnage:","Summer DWT:","Length:","Beam:","Draught:"]
-additionalInformation = ["Home port:","Class society:","Build year:","Builder (*):","Owner:","Manager:"]
-aisInformation = ["Last known position:","Status:","Speed, course (heading):","Destination:","Last update:","Source:"]
-impText = photoDetails + vesselIdentification + technicalData + additionalInformation  
+ORIGINAL_SIZE = 1  # 1 for yes, 0 for no
+JUST_IMAGE = 1  # 1 for yes, 0 for no
+
+photoDetails = ["Photographer:", "Title:", "Captured:", "IMO:", "Photo Category:", "Description:"]
+vesselIdentification = ["Name:", "IMO:", "Flag:", "MMSI:", "Callsign:"]
+technicalData = ["Vessel type:", "Gross tonnage:", "Summer DWT:", "Length:", "Beam:", "Draught:"]
+additionalInformation = ["Home port:", "Class society:", "Build year:", "Builder (*):", "Owner:", "Manager:"]
+aisInformation = ["Last known position:", "Status:", "Speed, course (heading):", "Destination:", "Last update:",
+                  "Source:"]
+impText = photoDetails + vesselIdentification + technicalData + additionalInformation
 impText2 = ["Former name(s):"]
 
-sourceLink = "http://www.shipspotting.com/gallery/photo.php?lid="
+sourceLink = "https://www.shipspotting.com/photos/"
 
 logging.basicConfig(level=logging.DEBUG, format='(%(threadName)-10s) %(message)s', )
 logging.debug("Process started at " + str(datetime.datetime.now()))
 
-def save_image(ID,justImage,outFolder):
+
+def save_image(ID, justImage, outFolder):
     url = sourceLink + ID
-    html = urlopen(url,timeout = 300).read()
-    soup = BeautifulSoup(html,"lxml")
+    print(url)
+    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
+    html = urlopen(req, timeout=600).read()
+    soup = BeautifulSoup(html, "lxml")
 
     images = [img for img in soup.findAll('img')]
     image_links = [each.get('src') for each in images]
     if not justImage:
         tags = [tr for tr in soup.findAll('td')]
         tr_text = [each.getText() for each in tags]
-        
+
     filename = " "
     for each in image_links:
-        if "http" in each and "jpg" in each and "photos/middle" in each:
-            filename=each.split('/')[-1]
-            f = urlopen(each)
-            with open(os.path.join(outFolder,filename), "wb") as local_file:
-                local_file.write(f.read())
-            if ORIGINAL_SIZE == 0:
-                img = Image.open(os.path.join(outFolder,filename)).resize((IMAGE_HEIGHT,IMAGE_WIDTH), Image.ANTIALIAS)
-                os.remove(os.path.join(outFolder,filename))
-                out = file(os.path.join(outFolder,filename),"wb")
-                img.save(out,"JPEG")
-            break
-
+        if each and isinstance(each, str) and "http" in each and "jpg" in each and "photos/big" in each:  # match the "photos/big" variant (the previous version matched "photos/middle")
+            # Handle a valid image link here
+            filename = each.split('/')[-1].split('?')[0]
+            req = Request(each, headers={'User-Agent': 'Mozilla/5.0'})
+            try:
+                print(f"Downloading: {each}")
+                f = urlopen(req, timeout=600)
+
+                output_path = os.path.join(outFolder, filename)
+                if not os.path.exists(outFolder):
+                    os.makedirs(outFolder)  # create the directory if it does not exist
+
+                # Download the image and save it to the specified output folder
+                with open(output_path, "wb") as local_file:
+                    local_file.write(f.read())
+                if ORIGINAL_SIZE == 0:
+                    img = Image.open(os.path.join(outFolder, filename)).resize((IMAGE_HEIGHT, IMAGE_WIDTH),
+                                                                               Image.Resampling.LANCZOS)
+                    os.remove(os.path.join(outFolder, filename))
+                    out = open(os.path.join(outFolder, filename), "wb")
+                    img.save(out, "JPEG")
+
+                current_time = datetime.datetime.now().strftime("%m/%d %H:%M:%S")
+                print(f"Success ({current_time}):{each}->{outFolder}")
+                break  # stop after the first successfully downloaded image
+
+            except Exception as e:
+                print(f"Failed to download {each}: {e}")
+
     if filename != " " and not justImage:
         textFile = filename.split('.')[0]
-        tFile = codecs.open(os.path.join(outFolder,filename)+'.dat','w','utf-8')    
-        for index,each in enumerate(tr_text):
+        tFile = codecs.open(os.path.join(outFolder, filename) + '.dat', 'w', 'utf-8')
+        for index, each in enumerate(tr_text):
             for impT in impText:
                 if impT == each:
-                    tFile.write(each + ' ' + tr_text[index+1] + '\n')
+                    tFile.write(each + ' ' + tr_text[index + 1] + '\n')
                     break
-        for index,each in enumerate(tr_text):
+        for index, each in enumerate(tr_text):
             for impT in impText2:
                 if impT == each:
-                    for ind in range(1,20):
-                        if tr_text[index+ind] != "":
-                            tFile.write(each + ' ' + tr_text[index+ind] + '\n')
+                    for ind in range(1, 20):
+                        if tr_text[index + ind] != "":
+                            tFile.write(each + ' ' + tr_text[index + ind] + '\n')
                         else:
                             break
                     break
@@ -84,22 +104,22 @@ def save_image(ID,justImage,outFolder):
         return 1
 
 
-def worker(content,workerNo):
+def worker(content, workerNo):
     workerIndex = 0
     folderIndex = 0
     folderNo = 1
-    currFolder = os.path.join(os.getcwd(),'W'+str(workerNo)+'_'+str(folderNo))
+    currFolder = os.path.join(os.getcwd(), 'W' + str(workerNo) + '_' + str(folderNo))
     if not os.path.exists(currFolder):
         os.mkdir(currFolder)
     for ID in content:
         if folderIndex == MAX_NUM_OF_FILES_IN_FOLDER:
             folderIndex = 0
             folderNo = folderNo + 1
-            currFolder = os.path.join(os.getcwd(),'W'+str(workerNo)+'_'+str(folderNo))
+            currFolder = os.path.join(os.getcwd(), 'W' + str(workerNo) + '_' + str(folderNo))
             if not os.path.exists(currFolder):
                 os.mkdir(currFolder)
         try:
-            status = save_image(ID,JUST_IMAGE,currFolder)
+            status = save_image(ID, JUST_IMAGE, currFolder)
             workerIndex = workerIndex + 1
             if status == 1:
                 folderIndex = folderIndex + 1
@@ -111,37 +131,40 @@ def worker(content,workerNo):
     logging.debug(str(datetime.datetime.now()) + "-------------- DONE ")
     return
 
+
 priorFiles = []
 dirs = os.listdir(os.getcwd())
 for eachDir in dirs:
     if 'W' in eachDir:
-        oldFiles = os.listdir(os.path.join(os.getcwd(),eachDir))
+        oldFiles = os.listdir(os.path.join(os.getcwd(), eachDir))
         for eachFile in oldFiles:
             if ".jpg" in eachFile:
                 oldID = eachFile.split(".")[0]
                 priorFiles.append(oldID)
 
-downloadFile = codecs.open(FILE_TO_DOWNLOAD_FROM,"r","utf-8")
+downloadFile = codecs.open(FILE_TO_DOWNLOAD_FROM, "r", "utf-8")
 downloadContent = downloadFile.readlines()
 downloadFile.close()
 finalContent = []
-for index,eachLine in enumerate(downloadContent):
+for index, eachLine in enumerate(downloadContent):
     temp = eachLine.split(',')[0]
     if temp not in priorFiles:
         finalContent.append(temp)
 
 numOfFiles = len(finalContent)
 
-numOfFilesPerEachWorker = [int(math.floor(float(numOfFiles)/NUMBER_OF_WORKERS)) for x in range(0,NUMBER_OF_WORKERS-1)]
-numOfFilesPerEachWorker.append(numOfFiles - (NUMBER_OF_WORKERS-1)*int(round(numOfFiles/NUMBER_OF_WORKERS,0)))
+numOfFilesPerEachWorker = [int(math.floor(float(numOfFiles) / NUMBER_OF_WORKERS)) for x in
+                           range(0, NUMBER_OF_WORKERS - 1)]
+numOfFilesPerEachWorker.append(numOfFiles - (NUMBER_OF_WORKERS - 1) * int(round(numOfFiles / NUMBER_OF_WORKERS, 0)))
 
 logging.debug("There will be %s workers in this download process" % NUMBER_OF_WORKERS)
 logging.debug("%s files will be downloaded" % numOfFiles)
 
 threads = []
 imageCount = 0
-for i in range(0,NUMBER_OF_WORKERS):
-    t = threading.Thread(name='Worker'+str(i), target=worker, args=(finalContent[imageCount:imageCount + numOfFilesPerEachWorker[i]],i,))
+for i in range(0, NUMBER_OF_WORKERS):
+    t = threading.Thread(name='Worker' + str(i), target=worker,
+                         args=(finalContent[imageCount:imageCount + numOfFilesPerEachWorker[i]], i,))
     imageCount = imageCount + numOfFilesPerEachWorker[i]
     threads.append(t)
     t.start()
@@ -150,7 +173,7 @@ def worker(content,workerNo):
 while flag:
     counter = 0
     for eachT in threads:
-        if eachT.isAlive() == False:
+        if not eachT.is_alive():
             counter = counter + 1
     if counter == NUMBER_OF_WORKERS:
         flag = False
@@ -161,42 +184,21 @@ def worker(content,workerNo):
 dirs = os.listdir(os.getcwd())
 for eachDir in dirs:
     if 'W' in eachDir:
-        FinalList = os.listdir(os.path.join(os.getcwd(),eachDir))
+        FinalList = os.listdir(os.path.join(os.getcwd(), eachDir))
         for eachFile in FinalList:
             if ".jpg" in eachFile:
-                fPath = os.path.join(os.getcwd(),eachDir,eachFile)
+                fPath = os.path.join(os.getcwd(), eachDir, eachFile)
                 fID = eachFile.split(".")[0]
                 allPaths.append(fPath)
                 allIDs.append(fID)
 logging.debug(str(datetime.datetime.now()) + " - write to disc ")
 
-FINAL = codecs.open("FINAL.dat","w","utf-8")
+FINAL = codecs.open("FINAL.dat", "w", "utf-8")
 for eachLine in downloadContent:
     tempID = eachLine.split(",")[0]
     try:
         tempIndex = allIDs.index(tempID)
-        FINAL.write(eachLine[:-1]+","+str(allPaths[tempIndex])+"\n")
+        FINAL.write(eachLine[:-1] + "," + str(allPaths[tempIndex]) + "\n")
     except:
-        FINAL.write(eachLine[:-1]+","+"-\n")
+        FINAL.write(eachLine[:-1] + "," + "-\n")
 FINAL.close()
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-