Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 79 additions & 77 deletions MARVEL_Download.py
Original file line number Diff line number Diff line change
@@ -1,79 +1,99 @@
from urllib.request import Request, urlopen
from datetime import datetime
from bs4 import BeautifulSoup
from urllib2 import urlopen
from PIL import Image
import traceback
import threading
import datetime
import logging
import codecs
import math
import sys
import os

##Uncomment the related dat file ('VesselClassification.dat' for Vessel Classification, 'IMOTrainAndTest.dat' for Vessel Verification/Retrieval/Recognition tasks.)
FILE_TO_DOWNLOAD_FROM = "VesselClassification.dat"
##FILE_TO_DOWNLOAD_FROM = "IMOTrainAndTest.dat"
FILE_TO_DOWNLOAD_FROM = "IMOTrainAndTest.dat"
##FILE_TO_DOWNLOAD_FROM = "IMOTrainAndTest.dat"

NUMBER_OF_WORKERS = 10
NUMBER_OF_WORKERS = 50
MAX_NUM_OF_FILES_IN_FOLDER = 5000
IMAGE_HEIGHT = 256
IMAGE_WIDTH = 256
ORIGINAL_SIZE = 0 # 1 for yes, 0 for no
JUST_IMAGE = 1 # 1 for yes, 0 for no


photoDetails = ["Photographer:","Title:","Captured:","IMO:","Photo Category:","Description:"]
vesselIdentification = ["Name:","IMO:","Flag:","MMSI:","Callsign:"]
technicalData = ["Vessel type:","Gross tonnage:","Summer DWT:","Length:","Beam:","Draught:"]
additionalInformation = ["Home port:","Class society:","Build year:","Builder (*):","Owner:","Manager:"]
aisInformation = ["Last known position:","Status:","Speed, course (heading):","Destination:","Last update:","Source:"]
impText = photoDetails + vesselIdentification + technicalData + additionalInformation
ORIGINAL_SIZE = 1 # 1 for yes, 0 for no
JUST_IMAGE = 1 # 1 for yes, 0 for no

photoDetails = ["Photographer:", "Title:", "Captured:", "IMO:", "Photo Category:", "Description:"]
vesselIdentification = ["Name:", "IMO:", "Flag:", "MMSI:", "Callsign:"]
technicalData = ["Vessel type:", "Gross tonnage:", "Summer DWT:", "Length:", "Beam:", "Draught:"]
additionalInformation = ["Home port:", "Class society:", "Build year:", "Builder (*):", "Owner:", "Manager:"]
aisInformation = ["Last known position:", "Status:", "Speed, course (heading):", "Destination:", "Last update:",
"Source:"]
impText = photoDetails + vesselIdentification + technicalData + additionalInformation
impText2 = ["Former name(s):"]

sourceLink = "http://www.shipspotting.com/gallery/photo.php?lid="
sourceLink = "https://www.shipspotting.com/photos/"

logging.basicConfig(level=logging.DEBUG, format='(%(threadName)-10s) %(message)s', )
logging.debug("Process started at " + str(datetime.datetime.now()))

def save_image(ID,justImage,outFolder):

def save_image(ID, justImage, outFolder):
url = sourceLink + ID
html = urlopen(url,timeout = 300).read()
soup = BeautifulSoup(html,"lxml")
print(url)
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urlopen(req, timeout=600).read()
soup = BeautifulSoup(html, "lxml")

images = [img for img in soup.findAll('img')]
image_links = [each.get('src') for each in images]
if not justImage:
tags = [tr for tr in soup.findAll('td')]
tr_text = [each.getText() for each in tags]

filename = " "
for each in image_links:
if "http" in each and "jpg" in each and "photos/middle" in each:
filename=each.split('/')[-1]
f = urlopen(each)
with open(os.path.join(outFolder,filename), "wb") as local_file:
local_file.write(f.read())
if ORIGINAL_SIZE == 0:
img = Image.open(os.path.join(outFolder,filename)).resize((IMAGE_HEIGHT,IMAGE_WIDTH), Image.ANTIALIAS)
os.remove(os.path.join(outFolder,filename))
out = file(os.path.join(outFolder,filename),"wb")
img.save(out,"JPEG")
break

if each and isinstance(each, str) and "http" in each and "jpg" in each and "photos/big" in each: # big middle
# 在这里处理有效的 image link
filename = each.split('/')[-1].split('?')[0]
req = Request(each, headers={'User-Agent': 'Mozilla/5.0'})
try:
print(f"Downloading: {each}")
f = urlopen(req, timeout=600)

output_path = os.path.join(outFolder, filename)
if not os.path.exists(outFolder):
os.makedirs(outFolder) # 创建目录,如果不存在的话

# 下载图像并保存到指定的输出文件夹
with open(output_path, "wb") as local_file:
local_file.write(f.read())
if ORIGINAL_SIZE == 0:
img = Image.open(os.path.join(outFolder, filename)).resize((IMAGE_HEIGHT, IMAGE_WIDTH),
Image.Resampling.LANCZOS)
os.remove(os.path.join(outFolder, filename))
out = open(os.path.join(outFolder, filename), "wb")
img.save(out, "JPEG")

current_time = datetime.datetime.now().strftime("%m/%d %H:%M:%S")
print(f"Success ({current_time}):{each}->{outFolder}")
break # only you

except Exception as e:
print(f"Failed to download {each}: {e}")

if filename != " " and not justImage:
textFile = filename.split('.')[0]
tFile = codecs.open(os.path.join(outFolder,filename)+'.dat','w','utf-8')
for index,each in enumerate(tr_text):
tFile = codecs.open(os.path.join(outFolder, filename) + '.dat', 'w', 'utf-8')
for index, each in enumerate(tr_text):
for impT in impText:
if impT == each:
tFile.write(each + ' ' + tr_text[index+1] + '\n')
tFile.write(each + ' ' + tr_text[index + 1] + '\n')
break
for index,each in enumerate(tr_text):
for index, each in enumerate(tr_text):
for impT in impText2:
if impT == each:
for ind in range(1,20):
if tr_text[index+ind] != "":
tFile.write(each + ' ' + tr_text[index+ind] + '\n')
for ind in range(1, 20):
if tr_text[index + ind] != "":
tFile.write(each + ' ' + tr_text[index + ind] + '\n')
else:
break
break
Expand All @@ -84,22 +104,22 @@ def save_image(ID,justImage,outFolder):
return 1


def worker(content,workerNo):
def worker(content, workerNo):
workerIndex = 0
folderIndex = 0
folderNo = 1
currFolder = os.path.join(os.getcwd(),'W'+str(workerNo)+'_'+str(folderNo))
currFolder = os.path.join(os.getcwd(), 'W' + str(workerNo) + '_' + str(folderNo))
if not os.path.exists(currFolder):
os.mkdir(currFolder)
for ID in content:
if folderIndex == MAX_NUM_OF_FILES_IN_FOLDER:
folderIndex = 0
folderNo = folderNo + 1
currFolder = os.path.join(os.getcwd(),'W'+str(workerNo)+'_'+str(folderNo))
currFolder = os.path.join(os.getcwd(), 'W' + str(workerNo) + '_' + str(folderNo))
if not os.path.exists(currFolder):
os.mkdir(currFolder)
try:
status = save_image(ID,JUST_IMAGE,currFolder)
status = save_image(ID, JUST_IMAGE, currFolder)
workerIndex = workerIndex + 1
if status == 1:
folderIndex = folderIndex + 1
Expand All @@ -111,37 +131,40 @@ def worker(content,workerNo):
logging.debug(str(datetime.datetime.now()) + "-------------- DONE ")
return


priorFiles = []
dirs = os.listdir(os.getcwd())
for eachDir in dirs:
if 'W' in eachDir:
oldFiles = os.listdir(os.path.join(os.getcwd(),eachDir))
oldFiles = os.listdir(os.path.join(os.getcwd(), eachDir))
for eachFile in oldFiles:
if ".jpg" in eachFile:
oldID = eachFile.split(".")[0]
priorFiles.append(oldID)

downloadFile = codecs.open(FILE_TO_DOWNLOAD_FROM,"r","utf-8")
downloadFile = codecs.open(FILE_TO_DOWNLOAD_FROM, "r", "utf-8")
downloadContent = downloadFile.readlines()
downloadFile.close()
finalContent = []
for index,eachLine in enumerate(downloadContent):
for index, eachLine in enumerate(downloadContent):
temp = eachLine.split(',')[0]
if temp not in priorFiles:
finalContent.append(temp)

numOfFiles = len(finalContent)

numOfFilesPerEachWorker = [int(math.floor(float(numOfFiles)/NUMBER_OF_WORKERS)) for x in range(0,NUMBER_OF_WORKERS-1)]
numOfFilesPerEachWorker.append(numOfFiles - (NUMBER_OF_WORKERS-1)*int(round(numOfFiles/NUMBER_OF_WORKERS,0)))
numOfFilesPerEachWorker = [int(math.floor(float(numOfFiles) / NUMBER_OF_WORKERS)) for x in
range(0, NUMBER_OF_WORKERS - 1)]
numOfFilesPerEachWorker.append(numOfFiles - (NUMBER_OF_WORKERS - 1) * int(round(numOfFiles / NUMBER_OF_WORKERS, 0)))

logging.debug("There will be %s workers in this download process" % NUMBER_OF_WORKERS)
logging.debug("%s files will be downloaded" % numOfFiles)

threads = []
imageCount = 0
for i in range(0,NUMBER_OF_WORKERS):
t = threading.Thread(name='Worker'+str(i), target=worker, args=(finalContent[imageCount:imageCount + numOfFilesPerEachWorker[i]],i,))
for i in range(0, NUMBER_OF_WORKERS):
t = threading.Thread(name='Worker' + str(i), target=worker,
args=(finalContent[imageCount:imageCount + numOfFilesPerEachWorker[i]], i,))
imageCount = imageCount + numOfFilesPerEachWorker[i]
threads.append(t)
t.start()
Expand All @@ -150,7 +173,7 @@ def worker(content,workerNo):
while flag:
counter = 0
for eachT in threads:
if eachT.isAlive() == False:
if not eachT.is_alive():
counter = counter + 1
if counter == NUMBER_OF_WORKERS:
flag = False
Expand All @@ -161,42 +184,21 @@ def worker(content,workerNo):
dirs = os.listdir(os.getcwd())
for eachDir in dirs:
if 'W' in eachDir:
FinalList = os.listdir(os.path.join(os.getcwd(),eachDir))
FinalList = os.listdir(os.path.join(os.getcwd(), eachDir))
for eachFile in FinalList:
if ".jpg" in eachFile:
fPath = os.path.join(os.getcwd(),eachDir,eachFile)
fPath = os.path.join(os.getcwd(), eachDir, eachFile)
fID = eachFile.split(".")[0]
allPaths.append(fPath)
allIDs.append(fID)
logging.debug(str(datetime.datetime.now()) + " - write to disc ")

FINAL = codecs.open("FINAL.dat","w","utf-8")
FINAL = codecs.open("FINAL.dat", "w", "utf-8")
for eachLine in downloadContent:
tempID = eachLine.split(",")[0]
try:
tempIndex = allIDs.index(tempID)
FINAL.write(eachLine[:-1]+","+str(allPaths[tempIndex])+"\n")
FINAL.write(eachLine[:-1] + "," + str(allPaths[tempIndex]) + "\n")
except:
FINAL.write(eachLine[:-1]+","+"-\n")
FINAL.write(eachLine[:-1] + "," + "-\n")
FINAL.close()