File Info

Rev. 0bc36b41d1b3964a397b5e569e9999af5bdd776c
Size 2,176 Bytes
Time 2021-10-01 17:38:14
Author Lorenzo Isella
Log Message

A simple script to perform some basic text mining.

Content

rm(list=ls())

library(tidyverse)
library(readxl)    # provides read_excel(), used below
library(stringr)
library(stringi)
library(openxlsx)
library(janitor)
library(tidytext)


source("/home/lorenzo/myprojects-hg/R-codes/stat_lib.R")
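
## Read the raw data set (title and description columns) from Excel;
## read_excel() comes from readxl.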

df_ini <- read_excel("etsi_wax_melts.xlsx")
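
## Clean the raw data: strip special characters from every column
## (remove_special_char() is presumably defined in the sourced stat_lib.R)
## and keep only the complete rows.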


df <- df_ini %>%
    mutate(across(everything(), ~ remove_special_char(.x, " "))) %>%
    filter(complete.cases(.))
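
## Keep the two text columns as separate one-column data frames.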


titles <- df %>%
    select(title)

descriptions <- df %>%
    select(description)
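
## Tokenize titles and descriptions into one word per row.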

tidy_titles <- titles %>%
    unnest_tokens(word, title)

tidy_descriptions <- descriptions %>%
    unnest_tokens(word, description)
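
## Drop common stop words using the stop_words lexicon shipped with tidytext.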


data(stop_words)

tidy_titles <- tidy_titles %>%
    anti_join(stop_words)

tidy_descriptions <- tidy_descriptions %>%
    anti_join(stop_words)
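
## Word frequencies, most common first.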



word_count_titles <- tidy_titles %>%
    count(word, sort = TRUE)  


word_count_descriptions <- tidy_descriptions %>%
    count(word, sort = TRUE) 
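
## A minimal sketch (not in the original script) of how the frequency tables
## could be inspected visually; ggplot2 is attached via the tidyverse above.
word_count_titles %>%
    slice_max(n, n = 20) %>%
    ggplot(aes(x = n, y = reorder(word, n))) +
    geom_col() +
    labs(x = "count", y = NULL, title = "Most frequent words in titles")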

### work on titles
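## Extract bigrams from the titles, split each bigram into its two words,
## drop pairs that contain a stop word, and count the remaining pairs.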

titles_bigrams <- titles %>%
    unnest_tokens(bigram, title, token = "ngrams", n = 2)

titles_bigrams_separated <- titles_bigrams %>%
    separate(bigram, c("word1", "word2"), sep = " ")

titles_bigrams_filtered <- titles_bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)


titles_bigram_counts <- titles_bigrams_filtered %>% 
    count(word1, word2, sort = TRUE)
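
## A minimal sketch (not in the original script): the counted word pairs can
## be glued back into a single bigram column with tidyr::unite(), e.g. for
## later plotting or joining.
titles_bigram_counts %>%
    unite(bigram, word1, word2, sep = " ")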





### work on descriptions
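## The same bigram pipeline, applied to the descriptions.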

descriptions_bigrams <- descriptions %>%
    unnest_tokens(bigram, description, token = "ngrams", n = 2)

descriptions_bigrams_separated <- descriptions_bigrams %>%
    separate(bigram, c("word1", "word2"), sep = " ")

descriptions_bigrams_filtered <- descriptions_bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)



descriptions_bigram_counts <- descriptions_bigrams_filtered %>% 
    count(word1, word2, sort = TRUE)
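
## Trigrams from the descriptions: tokenize, split into three words,
## drop any trigram containing a stop word, and count.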



trigram_description <- descriptions %>%
    unnest_tokens(trigram, description, token = "ngrams", n = 3) %>%
    separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
    filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
    count(word1, word2, word3, sort = TRUE)







print("So far so good")