From 2da5ca2500823f26347a6ead300aee4b63dde4fc Mon Sep 17 00:00:00 2001
From: XLC <631@qq.com>
Date: Sun, 15 Dec 2019 23:23:38 +0800
Subject: [PATCH] update

---
 ...器学习-坦克卫星图片识别分类.md | 39 ++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/chapter5/5.2Spark机器学习-坦克卫星图片识别分类.md b/chapter5/5.2Spark机器学习-坦克卫星图片识别分类.md
index bdb9d92..daa4da3 100644
--- a/chapter5/5.2Spark机器学习-坦克卫星图片识别分类.md
+++ b/chapter5/5.2Spark机器学习-坦克卫星图片识别分类.md
@@ -148,11 +148,48 @@ print(type(img))
 
 #### 5.2.3 Spark 加载数据集
 
-
 上面我们已经把数据集处理完毕之后，然后我们可以通过`Spark`来加载数据集。
 
 ```
+import cv2
+import os
+import pandas as pd
+from pyspark.mllib.evaluation import BinaryClassificationMetrics
+from pyspark.sql import SparkSession
+from pyspark.ml.linalg import Vectors
+from pyspark.ml.classification import LogisticRegression
+
+def get_file_path(root_path):
+    file_list = []
+    dir_or_files = os.listdir(root_path)
+    for dir_file in dir_or_files:
+        dir_file_path = os.path.join(root_path, dir_file)
+        file_list.append(dir_file_path)
+    return file_list
+
+
+def img2vector(imgfilename):
+    img = cv2.imread(imgfilename, cv2.IMREAD_GRAYSCALE)
+    rows, columns = img.shape
+    img = img.reshape(rows * columns)
+    # print(type(img))
+    return Vectors.dense(img)
+
+root_path = r"testing2"
+fileList = get_file_path(root_path)
+vectos = []
+for x in fileList:
+    vector = img2vector(x)
+    if "btr-70" in x:
+        label = 0
+    elif "t-72" in x:
+        label = 1
+    else:
+        label = 2
+    vectos.append((vector, label))
+df = pd.DataFrame(vectos, columns=['features', 'label'])
 spark = SparkSession.builder.master("local[*]").appName("demo").getOrCreate()
+spark.sparkContext.setLogLevel("error")
 sparkDF = spark.createDataFrame(df) # df是我们通过pandas构建的Dataframe
 ```
 